cue: complete JSON conformance

- accept escaped solidus
- accept matched escaped surrogate pairs

Closes #20

Change-Id: Ia04cb9b2ff9ff3006e4264d2eb32da71002a99ce
Reviewed-on: https://cue-review.googlesource.com/c/cue/+/1791
Reviewed-by: Marcel van Lohuizen <mpvl@google.com>

commit: dd5b4bf7ffa3b9f40769bebfb1d284c47c82aae6 [log] [tgz]
author: Marcel van Lohuizen <mpvl@golang.org> Mon Apr 08 12:37:45 2019 +0200
committer: Marcel van Lohuizen <mpvl@google.com> Mon Apr 08 10:43:33 2019 +0000
tree: 42752bcd8fa1fd47c67addd0295778e91308a2ff
parent: 44ab013cd29bff7bbc9cf5736202fb4300abd36e [diff]
diff --git a/cue/literal/string.go b/cue/literal/string.go
index 3a7fea9..8a44800 100644
--- a/cue/literal/string.go
+++ b/cue/literal/string.go

@@ -28,6 +28,9 @@
 	errMissingNewline    = errors.New(
 		"invalid string: opening quote of multiline string must be followed by newline")
 	errUnmatchedQuote = errors.New("invalid string: unmatched quote")
+	// TODO: making this an error is optional according to RFC 4627, so we
+	// could make it not an error if this ever results in an issue.
+	errSurrogate = errors.New("unmatched surrogate pair")
 )
 
 // Unquote interprets s as a single- or double-quoted, single- or multi-line
@@ -124,18 +127,19 @@
 }
 
 // Unquote unquotes the given string. It must be terminated with a quote or an
-// interpolation start.
+// interpolation start. Escape sequences are expanded and surrogates
+// are replaced with the corresponding non-surrogate code points.
 func (q QuoteInfo) Unquote(s string) (string, error) {
 	if len(s) > 0 && !q.multiline {
 		if contains(s, '\n') || contains(s, '\r') {
 			return "", errSyntax
 		}
+
 		// Is it trivial? Avoid allocation.
-		if s[len(s)-1] == q.char &&
-			q.numHash == 0 &&
-			!contains(s, '\\') &&
-			!contains(s[:len(s)-1], q.char) {
-			return s[:len(s)-1], nil
+		if s[len(s)-1] == q.char && q.numHash == 0 {
+			if s := s[:len(s)-1]; isSimple(s, rune(q.char)) {
+				return s, nil
+			}
 		}
 	}
 
@@ -163,11 +167,22 @@
 			continue
 		}
 		c, multibyte, ss, err := unquoteChar(s, q)
+		if surHigh <= c && c < surEnd {
+			if c >= surLow {
+				return "", errSurrogate
+			}
+			var cl rune
+			cl, _, ss, err = unquoteChar(ss, q)
+			if cl < surLow || surEnd <= cl {
+				return "", errSurrogate
+			}
+			c = 0x10000 + (c-surHigh)*0x400 + (cl - surLow)
+		}
+
 		if err != nil {
 			return "", err
 		}
-		// TODO: handle surrogates: if we have a left-surrogate, expect the
-		// next value to be a right surrogate. Otherwise this is an error.
+
 		s = ss
 		if c < 0 {
 			if c == -2 {
@@ -192,6 +207,27 @@
 	return "", errUnmatchedQuote
 }
 
+const (
+	surHigh = 0xD800 // start of the high-surrogate range [surHigh, surLow)
+	surLow  = 0xDC00 // start of the low-surrogate range [surLow, surEnd)
+	surEnd  = 0xE000 // first code point past the surrogate range
+)
+
+func isSimple(s string, quote rune) bool {
+	// Reports whether s has no quote, backslash, or surrogate code point.
+	// TODO(perf): check whether a simple DFA detecting surrogate pairs beats
+	// converting to code points; at the least, add an ASCII fast path.
+	for _, r := range s {
+		if r == quote || r == '\\' {
+			return false
+		}
+		if surHigh <= r && r < surEnd { // NOTE(review): range decodes encoded surrogates as U+FFFD, so this is likely unreachable — confirm
+			return false
+		}
+	}
+	return true
+}
+
 // contains reports whether the string contains the byte c.
 func contains(s string, c byte) bool {
 	for i := 0; i < len(s); i++ {
@@ -238,6 +274,9 @@
 		}
 		return -1, false, "", nil
 	case c >= utf8.RuneSelf:
+		// TODO: consider handling surrogate values. These are discarded by
+		// DecodeRuneInString. It is technically correct to disallow it, but
+		// some JSON parsers allow this anyway.
 		r, size := utf8.DecodeRuneInString(s)
 		return r, true, s[size:], nil
 	case c != '\\':

diff --git a/cue/literal/string_test.go b/cue/literal/string_test.go
index dd00b64..88c392b 100644
--- a/cue/literal/string_test.go
+++ b/cue/literal/string_test.go

@@ -54,6 +54,11 @@
 		{`"\x04"`, "", errSyntax},       // not allowed in strings
 		{`'\U01230123'`, "", errSyntax}, // too large
 
+		// Surrogate pairs
+		{`"\uD834\uDD1E"`, "𝄞", nil},
+		{`"\uDD1E\uD834"`, "", errSurrogate},
+		{`"\uD834\uD834"`, "", errSurrogate},
+
 		{`"\\"`, "\\", nil},
 		{`"\'"`, "", errSyntax},
 		{`"\q"`, "", errSyntax},

diff --git a/cue/scanner/scanner.go b/cue/scanner/scanner.go
index c18e50a..03c9c19 100644
--- a/cue/scanner/scanner.go
+++ b/cue/scanner/scanner.go

@@ -414,6 +414,8 @@
 // escaped quote. In case of a syntax error, it stops at the offending
 // character (without consuming it) and returns false. Otherwise
 // it returns true.
+//
+// Must be compliant with https://tools.ietf.org/html/rfc4627.
 func (s *Scanner) scanEscape(quote quoteInfo) (ok, interpolation bool) {
 	for i := 0; i < quote.numHash; i++ {
 		if s.ch != '#' {
@@ -429,7 +431,7 @@
 	switch s.ch {
 	case '(':
 		return true, true
-	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote.char:
+	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '/', quote.char:
 		s.next()
 		return true, false
 	case '0', '1', '2', '3', '4', '5', '6', '7':
@@ -470,7 +472,7 @@
 
 	// TODO: this is valid JSON, so remove, but normalize and report an error
 	// if for unmatched surrogate pairs .
-	if x > max || 0xD800 <= x && x < 0xE000 {
+	if x > max {
 		s.error(offs, "escape sequence is invalid Unicode code point")
 		return false, false
 	}

diff --git a/cue/scanner/scanner_test.go b/cue/scanner/scanner_test.go
index d1a0402..18bb460 100644
--- a/cue/scanner/scanner_test.go
+++ b/cue/scanner/scanner_test.go

@@ -104,8 +104,10 @@
 	{token.STRING, "'\\000'", literal},
 	{token.STRING, "'\\xFF'", literal},
 	{token.STRING, "'\\uff16'", literal},
+	{token.STRING, "'\\uD801'", literal},
 	{token.STRING, "'\\U0000ff16'", literal},
 	{token.STRING, "'foobar'", literal},
+	{token.STRING, `'foo\/bar'`, literal},
 	{token.STRING, `#"foobar"#`, literal},
 	{token.STRING, `#"\r"#`, literal},
 	{token.STRING, `#"\("#`, literal},
commit	dd5b4bf7ffa3b9f40769bebfb1d284c47c82aae6	[log] [tgz]
author	Marcel van Lohuizen <mpvl@golang.org>	Mon Apr 08 12:37:45 2019 +0200
committer	Marcel van Lohuizen <mpvl@google.com>	Mon Apr 08 10:43:33 2019 +0000
tree	42752bcd8fa1fd47c67addd0295778e91308a2ff
parent	44ab013cd29bff7bbc9cf5736202fb4300abd36e [diff]