cue: complete JSON conformance
- accept escaped solidus
- accept matched escaped surrogate pairs
Closes #20
Change-Id: Ia04cb9b2ff9ff3006e4264d2eb32da71002a99ce
Reviewed-on: https://cue-review.googlesource.com/c/cue/+/1791
Reviewed-by: Marcel van Lohuizen <mpvl@google.com>
diff --git a/cue/literal/string.go b/cue/literal/string.go
index 3a7fea9..8a44800 100644
--- a/cue/literal/string.go
+++ b/cue/literal/string.go
@@ -28,6 +28,9 @@
errMissingNewline = errors.New(
"invalid string: opening quote of multiline string must be followed by newline")
errUnmatchedQuote = errors.New("invalid string: unmatched quote")
+ // TODO: RFC 4627 makes rejecting unmatched surrogates optional, so we
+ // could stop treating this as an error if it ever results in an issue.
+ errSurrogate = errors.New("unmatched surrogate pair")
)
// Unquote interprets s as a single- or double-quoted, single- or multi-line
@@ -124,18 +127,19 @@
}
// Unquote unquotes the given string. It must be terminated with a quote or an
-// interpolation start.
+// interpolation start. Escape sequences are expanded and surrogates
+// are replaced with the corresponding non-surrogate code points.
func (q QuoteInfo) Unquote(s string) (string, error) {
if len(s) > 0 && !q.multiline {
if contains(s, '\n') || contains(s, '\r') {
return "", errSyntax
}
+
// Is it trivial? Avoid allocation.
- if s[len(s)-1] == q.char &&
- q.numHash == 0 &&
- !contains(s, '\\') &&
- !contains(s[:len(s)-1], q.char) {
- return s[:len(s)-1], nil
+ if s[len(s)-1] == q.char && q.numHash == 0 {
+ if s := s[:len(s)-1]; isSimple(s, rune(q.char)) {
+ return s, nil
+ }
}
}
@@ -163,11 +167,22 @@
continue
}
c, multibyte, ss, err := unquoteChar(s, q)
+ if surHigh <= c && c < surEnd {
+ if c >= surLow {
+ return "", errSurrogate
+ }
+ var cl rune
+ cl, _, ss, err = unquoteChar(ss, q)
+ if cl < surLow || surEnd <= cl {
+ return "", errSurrogate
+ }
+ c = 0x10000 + (c-surHigh)*0x400 + (cl - surLow)
+ }
+
if err != nil {
return "", err
}
- // TODO: handle surrogates: if we have a left-surrogate, expect the
- // next value to be a right surrogate. Otherwise this is an error.
+
s = ss
if c < 0 {
if c == -2 {
@@ -192,6 +207,27 @@
return "", errUnmatchedQuote
}
+const (
+ surHigh = 0xD800 // start of the high-surrogate range; first half of an escaped pair
+ surLow = 0xDC00 // start of the low-surrogate range; second half of an escaped pair
+ surEnd = 0xE000 // exclusive upper bound of the surrogate range
+)
+
+func isSimple(s string, quote rune) bool {
+ // isSimple reports whether s has no quote, backslash, or surrogate-range rune.
+ // TODO(perf): check if using a simple DFA to detect surrogate pairs is faster
+ // than converting to code points. At least there should be an ASCII fast path.
+ for _, r := range s {
+ if r == quote || r == '\\' { // an embedded quote or the start of an escape
+ return false
+ }
+ if surHigh <= r && r < surEnd { // rune lies in the surrogate range
+ return false // NOTE(review): range seems to yield U+FFFD for encoded surrogates, so this may never fire — confirm
+ }
+ }
+ return true
+}
+
// contains reports whether the string contains the byte c.
func contains(s string, c byte) bool {
for i := 0; i < len(s); i++ {
@@ -238,6 +274,9 @@
}
return -1, false, "", nil
case c >= utf8.RuneSelf:
+ // TODO: consider handling surrogate values. These are discarded by
+ // DecodeRuneInString. It is technically correct to disallow them, but
+ // some JSON parsers allow them anyway.
r, size := utf8.DecodeRuneInString(s)
return r, true, s[size:], nil
case c != '\\':
diff --git a/cue/literal/string_test.go b/cue/literal/string_test.go
index dd00b64..88c392b 100644
--- a/cue/literal/string_test.go
+++ b/cue/literal/string_test.go
@@ -54,6 +54,11 @@
{`"\x04"`, "", errSyntax}, // not allowed in strings
{`'\U01230123'`, "", errSyntax}, // too large
+ // Surrogate pairs
+ {`"\uD834\uDD1E"`, "𝄞", nil},
+ {`"\uDD1E\uD834"`, "", errSurrogate},
+ {`"\uD834\uD834"`, "", errSurrogate},
+
{`"\\"`, "\\", nil},
{`"\'"`, "", errSyntax},
{`"\q"`, "", errSyntax},
diff --git a/cue/scanner/scanner.go b/cue/scanner/scanner.go
index c18e50a..03c9c19 100644
--- a/cue/scanner/scanner.go
+++ b/cue/scanner/scanner.go
@@ -414,6 +414,8 @@
// escaped quote. In case of a syntax error, it stops at the offending
// character (without consuming it) and returns false. Otherwise
// it returns true.
+//
+// Must be compliant with https://tools.ietf.org/html/rfc4627.
func (s *Scanner) scanEscape(quote quoteInfo) (ok, interpolation bool) {
for i := 0; i < quote.numHash; i++ {
if s.ch != '#' {
@@ -429,7 +431,7 @@
switch s.ch {
case '(':
return true, true
- case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote.char:
+ case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '/', quote.char:
s.next()
return true, false
case '0', '1', '2', '3', '4', '5', '6', '7':
@@ -470,7 +472,7 @@
// TODO: this is valid JSON, so remove, but normalize and report an error
// for unmatched surrogate pairs.
- if x > max || 0xD800 <= x && x < 0xE000 {
+ if x > max {
s.error(offs, "escape sequence is invalid Unicode code point")
return false, false
}
diff --git a/cue/scanner/scanner_test.go b/cue/scanner/scanner_test.go
index d1a0402..18bb460 100644
--- a/cue/scanner/scanner_test.go
+++ b/cue/scanner/scanner_test.go
@@ -104,8 +104,10 @@
{token.STRING, "'\\000'", literal},
{token.STRING, "'\\xFF'", literal},
{token.STRING, "'\\uff16'", literal},
+ {token.STRING, "'\\uD801'", literal},
{token.STRING, "'\\U0000ff16'", literal},
{token.STRING, "'foobar'", literal},
+ {token.STRING, `'foo\/bar'`, literal},
{token.STRING, `#"foobar"#`, literal},
{token.STRING, `#"\r"#`, literal},
{token.STRING, `#"\("#`, literal},