Merge "cue: hoist literal string parsing into new package"

commit: f54e594863f0cbd75a4cc04ab0c6dfd4dcdb4932 [log] [tgz]
author: Marcel van Lohuizen <mpvl@google.com> Tue Feb 19 01:28:58 2019 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Tue Feb 19 01:28:58 2019 +0000
tree: 6f70def4af53efedecd5be51a5022a18fafd0479
parent: bbdfb896e3cc2248b69b17fb2cd875d852c0c28b [diff]
parent: 6ceb601c732b2a118a149a28786835412f4bfb2b [diff]
diff --git a/cmd/cue/cmd/import.go b/cmd/cue/cmd/import.go
index 1d988b5..4b975ba 100644
--- a/cmd/cue/cmd/import.go
+++ b/cmd/cue/cmd/import.go

@@ -32,6 +32,7 @@
 	"cuelang.org/go/cue/ast"
 	"cuelang.org/go/cue/encoding"
 	"cuelang.org/go/cue/format"
+	"cuelang.org/go/cue/literal"
 	"cuelang.org/go/cue/load"
 	"cuelang.org/go/cue/parser"
 	"cuelang.org/go/cue/token"
@@ -658,7 +659,7 @@
 				continue
 			}
 
-			str, err := cue.Unquote(lit.Value)
+			str, err := literal.Unquote(lit.Value)
 			if err != nil {
 				continue
 			}

diff --git a/cue/ast.go b/cue/ast.go
index 0f3086d..3ca7fd5 100644
--- a/cue/ast.go
+++ b/cue/ast.go

@@ -16,11 +16,11 @@
 
 import (
 	"fmt"
-	"strconv"
 	"strings"
 
 	"cuelang.org/go/cue/ast"
 	"cuelang.org/go/cue/build"
+	"cuelang.org/go/cue/literal"
 	"cuelang.org/go/cue/token"
 )
 
@@ -131,7 +131,7 @@
 	if !isBottom(val) {
 		return val
 	}
-	path, err := strconv.Unquote(imp.Path.Value)
+	path, err := literal.Unquote(imp.Path.Value)
 	if err != nil {
 		return ctx.mkErr(newNode(imp), "illformed import spec")
 	}
@@ -383,7 +383,7 @@
 		}
 		lit := &interpolation{baseValue: newExpr(n), k: stringKind}
 		value = lit
-		info, prefixLen, _, err := ParseQuotes(first.Value, last.Value)
+		info, prefixLen, _, err := literal.ParseQuotes(first.Value, last.Value)
 		if err != nil {
 			return v.error(n, "invalid interpolation: %v", err)
 		}

diff --git a/cue/builtin.go b/cue/builtin.go
index b6fdd17..39c3fad 100644
--- a/cue/builtin.go
+++ b/cue/builtin.go

@@ -24,10 +24,10 @@
 	"path"
 	"reflect"
 	"sort"
-	"strconv"
 	"strings"
 
 	"cuelang.org/go/cue/ast"
+	"cuelang.org/go/cue/literal"
 	"github.com/cockroachdb/apd"
 )
 
@@ -349,7 +349,7 @@
 
 // lookupBuiltinPkg returns the builtin package for the given path if it exists.
 func lookupBuiltinPkg(ctx *context, imp *ast.ImportSpec) evaluated {
-	path, err := strconv.Unquote(imp.Path.Value)
+	path, err := literal.Unquote(imp.Path.Value)
 	if err != nil {
 		return ctx.mkErr(newNode(imp), "illformed import spec")
 	}

diff --git a/cue/builtins.go b/cue/builtins.go
index f16d702..51162d5 100644
--- a/cue/builtins.go
+++ b/cue/builtins.go

@@ -25,6 +25,7 @@
 	"text/template"
 	"unicode"
 
+	"cuelang.org/go/cue/literal"
 	"cuelang.org/go/cue/parser"
 	"cuelang.org/go/cue/token"
 	"cuelang.org/go/internal/third_party/yaml"
@@ -1283,7 +1284,7 @@
 		Func: func(c *callCtxt) {
 			s := c.string(0)
 			c.ret, c.err = func() (interface{}, error) {
-				return Unquote(s)
+				return literal.Unquote(s)
 			}()
 		},
 	}, {

diff --git a/cue/lit.go b/cue/lit.go
index e524b1b..74e3ccd 100644
--- a/cue/lit.go
+++ b/cue/lit.go

@@ -16,12 +16,10 @@
 
 import (
 	"math/big"
-	"strings"
-	"unicode"
-	"unicode/utf8"
 
 	"cuelang.org/go/cue/ast"
 	"cuelang.org/go/cue/errors"
+	"cuelang.org/go/cue/literal"
 	"github.com/cockroachdb/apd"
 )
 
@@ -34,267 +32,6 @@
 
 var errInvalidString = errors.New("invalid string")
 
-// Unquote interprets s as a single- or double-quoted, single- or multi-line
-// string, possibly with custom escape delimiters, returning the string value
-// that s quotes.
-func Unquote(s string) (string, error) {
-	info, nStart, _, err := ParseQuotes(s, s)
-	if err != nil {
-		return "", err
-	}
-	s = s[nStart:]
-	return info.Unquote(s)
-}
-
-// Unquote unquotes the given string. It must be terminated with a quote or an
-// interpolation start.
-func (q QuoteInfo) Unquote(s string) (string, error) {
-	if len(s) > 0 && !q.multiline {
-		if contains(s, '\n') || contains(s, '\r') {
-			return "", errSyntax
-		}
-		// Is it trivial? Avoid allocation.
-		if s[len(s)-1] == q.char &&
-			q.numHash == 0 &&
-			!contains(s, '\\') &&
-			!contains(s[:len(s)-1], q.char) {
-			return s[:len(s)-1], nil
-		}
-	}
-
-	var runeTmp [utf8.UTFMax]byte
-	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
-	stripNL := false
-	for len(s) > 0 {
-		switch s[0] {
-		case '\r':
-			s = s[1:]
-			continue
-		case '\n':
-			switch {
-			case !q.multiline:
-				fallthrough
-			default:
-				return "", errInvalidWhitespace
-			case strings.HasPrefix(s[1:], q.whitespace):
-				s = s[1+len(q.whitespace):]
-			case strings.HasPrefix(s[1:], "\n"):
-				s = s[1:]
-			}
-			stripNL = true
-			buf = append(buf, '\n')
-			continue
-		}
-		c, multibyte, ss, err := unquoteChar(s, q)
-		if err != nil {
-			return "", err
-		}
-		// TODO: handle surrogates: if we have a left-surrogate, expect the
-		// next value to be a right surrogate. Otherwise this is an error.
-		s = ss
-		if c < 0 {
-			if c == -2 {
-				stripNL = false
-			}
-			if stripNL {
-				// Strip the last newline, but only if it came from a closing
-				// quote.
-				buf = buf[:len(buf)-1]
-			}
-			return string(buf), nil
-		}
-		stripNL = false
-		if c < utf8.RuneSelf || !multibyte {
-			buf = append(buf, byte(c))
-		} else {
-			n := utf8.EncodeRune(runeTmp[:], c)
-			buf = append(buf, runeTmp[:n]...)
-		}
-	}
-	// allow unmatched quotes if already checked.
-	return "", errUnmatchedQuote
-}
-
-// contains reports whether the string contains the byte c.
-func contains(s string, c byte) bool {
-	for i := 0; i < len(s); i++ {
-		if s[i] == c {
-			return true
-		}
-	}
-	return false
-}
-
-// unquoteChar decodes the first character or byte in the escaped string.
-// It returns four values:
-//
-//	1) value, the decoded Unicode code point or byte value; the special value
-//     of -1 indicates terminated by quotes and -2 means terminated by \(.
-//	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
-//	3) tail, the remainder of the string after the character; and
-//	4) an error that will be nil if the character is syntactically valid.
-//
-// The second argument, kind, specifies the type of literal being parsed
-// and therefore which kind of escape sequences are permitted.
-// For kind 's' only JSON escapes and \u{ are permitted.
-// For kind 'b' also hexadecimal and octal escape sequences are permitted.
-//
-// The third argument, quote, specifies that an ASCII quoting character that
-// is not permitted in the output.
-func unquoteChar(s string, info QuoteInfo) (value rune, multibyte bool, tail string, err error) {
-	// easy cases
-	switch c := s[0]; {
-	case c == info.char && info.char != 0:
-		for i := 1; byte(i) < info.numChar; i++ {
-			if i >= len(s) || s[i] != info.char {
-				return rune(info.char), false, s[1:], nil
-			}
-		}
-		for i := 0; i < info.numHash; i++ {
-			if i+int(info.numChar) >= len(s) || s[i+int(info.numChar)] != '#' {
-				return rune(info.char), false, s[1:], nil
-			}
-		}
-		if ln := int(info.numChar) + info.numHash; len(s) != ln {
-			// TODO: terminating quote in middle of string
-			return 0, false, s[ln:], errSyntax
-		}
-		return -1, false, "", nil
-	case c >= utf8.RuneSelf:
-		r, size := utf8.DecodeRuneInString(s)
-		return r, true, s[size:], nil
-	case c != '\\':
-		return rune(s[0]), false, s[1:], nil
-	}
-
-	if len(s) <= 1+info.numHash {
-		return '\\', false, s[1:], nil
-	}
-	for i := 1; i <= info.numHash && i < len(s); i++ {
-		if s[i] != '#' {
-			return '\\', false, s[1:], nil
-		}
-	}
-
-	c := s[1+info.numHash]
-	s = s[2+info.numHash:]
-
-	switch c {
-	case 'a':
-		value = '\a'
-	case 'b':
-		value = '\b'
-	case 'f':
-		value = '\f'
-	case 'n':
-		value = '\n'
-	case 'r':
-		value = '\r'
-	case 't':
-		value = '\t'
-	case 'v':
-		value = '\v'
-	case '/':
-		value = '/'
-	case 'x', 'u', 'U':
-		n := 0
-		switch c {
-		case 'x':
-			n = 2
-		case 'u':
-			n = 4
-		case 'U':
-			n = 8
-		}
-		var v rune
-		if len(s) < n {
-			err = errSyntax
-			return
-		}
-		for j := 0; j < n; j++ {
-			x, ok := unhex(s[j])
-			if !ok {
-				err = errSyntax
-				return
-			}
-			v = v<<4 | x
-		}
-		s = s[n:]
-		if c == 'x' {
-			if info.char == '"' {
-				err = errSyntax
-				return
-			}
-			// single-byte string, possibly not UTF-8
-			value = v
-			break
-		}
-		if v > utf8.MaxRune {
-			err = errSyntax
-			return
-		}
-		value = v
-		multibyte = true
-	case '0', '1', '2', '3', '4', '5', '6', '7':
-		if info.char == '"' {
-			err = errSyntax
-			return
-		}
-		v := rune(c) - '0'
-		if len(s) < 2 {
-			err = errSyntax
-			return
-		}
-		for j := 0; j < 2; j++ { // one digit already; two more
-			x := rune(s[j]) - '0'
-			if x < 0 || x > 7 {
-				err = errSyntax
-				return
-			}
-			v = (v << 3) | x
-		}
-		s = s[2:]
-		if v > 255 {
-			err = errSyntax
-			return
-		}
-		value = v
-	case '\\':
-		value = '\\'
-	case '\'', '"':
-		// TODO: should we allow escaping of quotes regardless?
-		if c != info.char {
-			err = errSyntax
-			return
-		}
-		value = rune(c)
-	case '(':
-		if s != "" {
-			// TODO: terminating quote in middle of string
-			return 0, false, s, errSyntax
-		}
-		value = -2
-	default:
-		err = errSyntax
-		return
-	}
-	tail = s
-	return
-}
-
-func unhex(b byte) (v rune, ok bool) {
-	c := rune(b)
-	switch {
-	case '0' <= c && c <= '9':
-		return c - '0', true
-	case 'a' <= c && c <= 'f':
-		return c - 'a' + 10, true
-	case 'A' <= c && c <= 'F':
-		return c - 'A' + 10, true
-	}
-	return
-}
-
 type numInfo struct {
 	rep multiplier
 	k   kind
@@ -426,7 +163,7 @@
 	}
 	switch p.ch {
 	case '"', '\'', '`', '#':
-		info, nStart, _, err := ParseQuotes(s, s)
+		info, nStart, _, err := literal.ParseQuotes(s, s)
 		if err != nil {
 			return p.error(l, err.Error())
 		}
@@ -447,100 +184,8 @@
 	return n
 }
 
-var (
-	errStringTooShort    = errors.New("invalid string: too short")
-	errInvalidWhitespace = errors.New("invalid string: invalid whitespace")
-	errMissingNewline    = errors.New(
-		"invalid string: opening quote of multiline string must be followed by newline")
-	errUnmatchedQuote = errors.New("invalid string: unmatched quote")
-)
-
-// QuoteInfo describes the type of quotes used for a string.
-type QuoteInfo struct {
-	quote      string
-	whitespace string
-	numHash    int
-	multiline  bool
-	char       byte
-	numChar    byte
-}
-
-// IsDouble reports whether the literal uses double quotes.
-func (q QuoteInfo) IsDouble() bool {
-	return q.char == '"'
-}
-
-// ParseQuotes checks if the opening quotes in start matches the ending quotes
-// in end and reports its type as q or an error if they do not matching or are
-// invalid. nStart indicates the number of bytes used for the opening quote.
-func ParseQuotes(start, end string) (q QuoteInfo, nStart, nEnd int, err error) {
-	for i, c := range start {
-		if c != '#' {
-			break
-		}
-		q.numHash = i + 1
-	}
-	if len(start) < 2+2*q.numHash {
-		return q, 0, 0, errStringTooShort
-	}
-	s := start[q.numHash:]
-	switch s[0] {
-	case '"', '\'':
-		q.char = s[0]
-		if len(s) > 3 && s[1] == s[0] && s[2] == s[0] {
-			switch s[3] {
-			case '\n':
-				q.quote = start[:3+q.numHash]
-			case '\r':
-				if len(s) > 4 && s[4] == '\n' {
-					q.quote = start[:4+q.numHash]
-					break
-				}
-				fallthrough
-			default:
-				return q, 0, 0, errMissingNewline
-			}
-			q.multiline = true
-			q.numChar = 3
-			nStart = len(q.quote) + 1 // add whitespace later
-		} else {
-			q.quote = start[:1+q.numHash]
-			q.numChar = 1
-			nStart = len(q.quote)
-		}
-	default:
-		return q, 0, 0, errSyntax
-	}
-	quote := start[:int(q.numChar)+q.numHash]
-	for i := 0; i < len(quote); i++ {
-		if j := len(end) - i - 1; j < 0 || quote[i] != end[j] {
-			return q, 0, 0, errUnmatchedQuote
-		}
-	}
-	if q.multiline {
-		i := len(end) - len(quote)
-		for i > 0 {
-			r, size := utf8.DecodeLastRuneInString(end[:i])
-			if r == '\n' || !unicode.IsSpace(r) {
-				break
-			}
-			i -= size
-		}
-		q.whitespace = end[i : len(end)-len(quote)]
-
-		if len(start) > nStart && start[nStart] != '\n' {
-			if !strings.HasPrefix(start[nStart:], q.whitespace) {
-				return q, 0, 0, errInvalidWhitespace
-			}
-			nStart += len(q.whitespace)
-		}
-	}
-
-	return q, nStart, int(q.numChar) + q.numHash, nil
-}
-
 // parseString decodes a string without the starting and ending quotes.
-func parseString(ctx *context, node ast.Expr, q QuoteInfo, s string) (n value) {
+func parseString(ctx *context, node ast.Expr, q literal.QuoteInfo, s string) (n value) {
 	src := newExpr(node)
 	str, err := q.Unquote(s)
 	if err != nil {

diff --git a/cue/lit_test.go b/cue/lit_test.go
index 8263be2..5c3c7f7 100644
--- a/cue/lit_test.go
+++ b/cue/lit_test.go

@@ -25,150 +25,6 @@
 	"github.com/google/go-cmp/cmp/cmpopts"
 )
 
-func TestUnquote(t *testing.T) {
-	testCases := []struct {
-		in, out string
-		err     error
-	}{
-		{`"Hello"`, "Hello", nil},
-		{`'Hello'`, "Hello", nil},
-		{`'Hellø'`, "Hellø", nil},
-		{`"""` + "\n\t\tHello\n\t\t" + `"""`, "Hello", nil},
-		{"'''\n\t\tHello\n\t\t'''", "Hello", nil},
-		{"'''\n\t\tHello\n\n\t\t'''", "Hello\n", nil},
-		{"'''\n\n\t\tHello\n\t\t'''", "\nHello", nil},
-		{"'''\n\n\n\n\t\t'''", "\n\n", nil},
-		{"'''\n\t\t'''", "", nil},
-		{`"""` + "\n\raaa\n\rbbb\n\r" + `"""`, "aaa\nbbb", nil},
-		{`'\a\b\f\n\r\t\v\'\\\/'`, "\a\b\f\n\r\t\v'\\/", nil},
-		{`"\a\b\f\n\r\t\v\"\\\/"`, "\a\b\f\n\r\t\v\"\\/", nil},
-		{`#"The sequence "\U0001F604" renders as \#U0001F604."#`,
-			`The sequence "\U0001F604" renders as 😄.`,
-			nil},
-		{`"  \U00010FfF"`, "  \U00010fff", nil},
-		{`"\u0061 "`, "a ", nil},
-		{`'\x61\x55'`, "\x61\x55", nil},
-		{`'\061\055'`, "\061\055", nil},
-		{`'\377 '`, "\377 ", nil},
-		{"'e\u0300\\n'", "e\u0300\n", nil},
-		{`'\06\055'`, "", errSyntax},
-		{`'\0'`, "", errSyntax},
-		{`"\06\055"`, "", errSyntax},    // too short
-		{`'\777 '`, "", errSyntax},      // overflow
-		{`'\U012301'`, "", errSyntax},   // too short
-		{`'\U0123012G'`, "", errSyntax}, // invalid digit G
-		{`"\x04"`, "", errSyntax},       // not allowed in strings
-		{`'\U01230123'`, "", errSyntax}, // too large
-
-		{`"\\"`, "\\", nil},
-		{`"\'"`, "", errSyntax},
-		{`"\q"`, "", errSyntax},
-		{"'\n'", "", errSyntax},
-		{"'---\n---'", "", errSyntax},
-		{"'''\r'''", "", errMissingNewline},
-
-		{`#"Hello"#`, "Hello", nil},
-		{`#"Hello\v"#`, "Hello\\v", nil},
-		{`#"Hello\#v\r"#`, "Hello\v\\r", nil},
-		{`##"Hello\##v\r"##`, "Hello\v\\r", nil},
-		{`##"Hello\##v"##`, "Hello\v", nil},
-		{"#'''\n\t\tHello\\#v\n\t\t'''#", "Hello\v", nil},
-		{"##'''\n\t\tHello\\#v\n\t\t'''##", "Hello\\#v", nil},
-		{`#"""` + "\n\t\t\\#r\n\t\t" + `"""#`, "\r", nil},
-		{`#""#`, "", nil},
-		{`#"This is a "dog""#`, `This is a "dog"`, nil},
-		{"#\"\"\"\n\"\n\"\"\"#", `"`, nil},
-		{"#\"\"\"\n\"\"\"\n\"\"\"#", `"""`, nil},
-		{"#\"\"\"\n\na\n\n\"\"\"#", "\na\n", nil},
-		// Gobble extra \r
-		{"#\"\"\"\n\ra\n\r\"\"\"#", `a`, nil},
-		{"#\"\"\"\n\r\n\ra\n\r\n\r\"\"\"#", "\na\n", nil},
-		// Make sure this works for Windows.
-		{"#\"\"\"\r\n\r\na\r\n\r\n\"\"\"#", "\na\n", nil},
-		{"#\"\"\"\r\n \r\n a\r\n \r\n \"\"\"#", "\na\n", nil},
-		{"#\"\"\"\r\na\r\n\"\"\"#", `a`, nil},
-		{"#\"\"\"\r\n\ra\r\n\r\"\"\"#", `a`, nil},
-		{`####"   \"####`, `   \`, nil},
-
-		{"```", "", errSyntax},
-		{"Hello", "", errSyntax},
-		{`"Hello`, "", errUnmatchedQuote},
-		{`"""Hello"""`, "", errMissingNewline},
-		{"'''\n  Hello\n   '''", "", errInvalidWhitespace},
-		{"'''\n   a\n  b\n   '''", "", errInvalidWhitespace},
-		{`"Hello""`, "", errSyntax},
-		{`#"Hello"`, "", errUnmatchedQuote},
-		{`#"Hello'#`, "", errUnmatchedQuote},
-		{`#"""#`, "", errMissingNewline},
-
-		// TODO: should these be legal?
-		{`#"""#`, "", errMissingNewline},
-	}
-	for i, tc := range testCases {
-		t.Run(fmt.Sprintf("%d/%s", i, tc.in), func(t *testing.T) {
-			if got, err := Unquote(tc.in); err != tc.err {
-				t.Errorf("error: got %q; want %q", err, tc.err)
-			} else if got != tc.out {
-				t.Errorf("value: got %q; want %q", got, tc.out)
-			}
-		})
-	}
-}
-
-func TestInterpolation(t *testing.T) {
-	testCases := []struct {
-		quotes string
-		in     string
-		out    string
-		err    error
-	}{
-		{`""`, `foo\(`, "foo", nil},
-		{`"""` + "\n" + `"""`, `foo`, "", errUnmatchedQuote},
-		{`#""#`, `foo\#(`, "foo", nil},
-		{`#""#`, `foo\(`, "", errUnmatchedQuote},
-		{`""`, `foo\(bar`, "", errSyntax},
-		{`""`, ``, "", errUnmatchedQuote},
-		{`#""#`, `"`, "", errUnmatchedQuote},
-		{`#""#`, `\`, "", errUnmatchedQuote},
-		{`##""##`, `\'`, "", errUnmatchedQuote},
-	}
-	for i, tc := range testCases {
-		t.Run(fmt.Sprintf("%d/%s/%s", i, tc.quotes, tc.in), func(t *testing.T) {
-			info, _, _, _ := ParseQuotes(tc.quotes, tc.quotes)
-			if got, err := info.Unquote(tc.in); err != tc.err {
-				t.Errorf("error: got %q; want %q", err, tc.err)
-			} else if got != tc.out {
-				t.Errorf("value: got %q; want %q", got, tc.out)
-			}
-		})
-	}
-}
-
-func TestIsDouble(t *testing.T) {
-	testCases := []struct {
-		quotes string
-		double bool
-	}{
-		{`""`, true},
-		{`"""` + "\n" + `"""`, true},
-		{`#""#`, true},
-		{`''`, false},
-		{`'''` + "\n" + `'''`, false},
-		{`#''#`, false},
-	}
-	for i, tc := range testCases {
-		t.Run(fmt.Sprintf("%d/%s", i, tc.quotes), func(t *testing.T) {
-			info, _, _, err := ParseQuotes(tc.quotes, tc.quotes)
-			if err != nil {
-				t.Fatal(err)
-			}
-			if got := info.IsDouble(); got != tc.double {
-				t.Errorf("got %v; want %v", got, tc.double)
-			}
-		})
-	}
-}
-
 var defIntBase = newNumBase(&ast.BasicLit{}, newNumInfo(numKind, 0, 10, false))
 var defRatBase = newNumBase(&ast.BasicLit{}, newNumInfo(floatKind, 0, 10, false))
 

diff --git a/cue/literal/doc.go b/cue/literal/doc.go
new file mode 100644
index 0000000..3d3095c
--- /dev/null
+++ b/cue/literal/doc.go

@@ -0,0 +1,17 @@
+// Copyright 2019 CUE Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package literal implements conversions to and from string representations of
+// basic data types.
+package literal

diff --git a/cue/literal/string.go b/cue/literal/string.go
new file mode 100644
index 0000000..3a7fea9
--- /dev/null
+++ b/cue/literal/string.go

@@ -0,0 +1,373 @@
+// Copyright 2019 CUE Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package literal
+
+import (
+	"errors"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+var (
+	errSyntax            = errors.New("invalid syntax")
+	errInvalidString     = errors.New("invalid string")
+	errInvalidWhitespace = errors.New("invalid string: invalid whitespace")
+	errMissingNewline    = errors.New(
+		"invalid string: opening quote of multiline string must be followed by newline")
+	errUnmatchedQuote = errors.New("invalid string: unmatched quote")
+)
+
+// Unquote returns the string value denoted by the quoted literal s. The
+// literal may use single or double quotes, may span one or several lines, and
+// may carry custom escape delimiters ('#' characters around the quotes).
+func Unquote(s string) (string, error) {
+	q, offset, _, err := ParseQuotes(s, s)
+	if err == nil {
+		// Skip the opening delimiter; the closing one is consumed by
+		// QuoteInfo.Unquote itself.
+		return q.Unquote(s[offset:])
+	}
+	return "", err
+}
+
+// QuoteInfo describes the type of quotes used for a string. Values are
+// produced by ParseQuotes and consumed by the Unquote method.
+type QuoteInfo struct {
+	// quote is the opening delimiter as sliced from the start string by
+	// ParseQuotes: the leading hashes plus the quote character(s).
+	quote      string
+	// whitespace is the run of non-newline whitespace found directly before
+	// the closing quote of a multiline string; Unquote strips it after each
+	// newline.
+	whitespace string
+	// numHash is the number of leading '#' characters.
+	numHash    int
+	// multiline is set when the literal opens with three quote characters
+	// followed by a newline.
+	multiline  bool
+	// char is the quote character, '"' or '\''.
+	char       byte
+	// numChar is the number of quote characters in the delimiter: 1 or 3.
+	numChar    byte
+}
+
+// IsDouble reports whether the quote character of the literal is '"'.
+func (q QuoteInfo) IsDouble() bool {
+	const doubleQuote = '"'
+	return doubleQuote == q.char
+}
+
+// ParseQuotes checks whether the opening quotes in start match the closing
+// quotes in end and reports the quote type as q. It returns an error if the
+// quotes do not match or are invalid.
+//
+// nStart is the number of bytes used for the opening quote; for multiline
+// strings this includes the newline that follows the quote and, when the first
+// line is not empty, its leading indentation. nEnd is the number of bytes of
+// the closing quote, including its '#' characters.
+//
+// start and end may be the same string (a complete literal) or the first and
+// last fragment of an interpolation.
+func ParseQuotes(start, end string) (q QuoteInfo, nStart, nEnd int, err error) {
+	for i, c := range start {
+		if c != '#' {
+			break
+		}
+		q.numHash = i + 1
+	}
+	// A literal needs at least an opening and a closing quote character, each
+	// accompanied by numHash hashes. Checking this up front also guarantees
+	// that s[0] below cannot index out of range for empty or hash-only input.
+	if len(start) < 2+2*q.numHash {
+		return q, 0, 0, errInvalidString
+	}
+	s := start[q.numHash:]
+	switch s[0] {
+	case '"', '\'':
+		q.char = s[0]
+		if len(s) > 3 && s[1] == s[0] && s[2] == s[0] {
+			// Three quote characters open a multiline string, which must be
+			// followed directly by a newline ("\n" or "\r\n").
+			switch s[3] {
+			case '\n':
+				q.quote = start[:3+q.numHash]
+			case '\r':
+				if len(s) > 4 && s[4] == '\n' {
+					q.quote = start[:4+q.numHash]
+					break
+				}
+				fallthrough
+			default:
+				return q, 0, 0, errMissingNewline
+			}
+			q.multiline = true
+			q.numChar = 3
+			nStart = len(q.quote) + 1 // add whitespace later
+		} else {
+			q.quote = start[:1+q.numHash]
+			q.numChar = 1
+			nStart = len(q.quote)
+		}
+	default:
+		return q, 0, 0, errSyntax
+	}
+	// The closing delimiter is the mirror image of the opening one: compare
+	// the opening bytes front-to-back against the end of the string
+	// back-to-front.
+	quote := start[:int(q.numChar)+q.numHash]
+	for i := 0; i < len(quote); i++ {
+		if j := len(end) - i - 1; j < 0 || quote[i] != end[j] {
+			return q, 0, 0, errUnmatchedQuote
+		}
+	}
+	if q.multiline {
+		// Walk backwards from the closing quote over non-newline whitespace
+		// to find the indentation that every line must share.
+		i := len(end) - len(quote)
+		for i > 0 {
+			r, size := utf8.DecodeLastRuneInString(end[:i])
+			if r == '\n' || !unicode.IsSpace(r) {
+				break
+			}
+			i -= size
+		}
+		q.whitespace = end[i : len(end)-len(quote)]
+
+		// A non-empty first line must start with that same indentation.
+		if len(start) > nStart && start[nStart] != '\n' {
+			if !strings.HasPrefix(start[nStart:], q.whitespace) {
+				return q, 0, 0, errInvalidWhitespace
+			}
+			nStart += len(q.whitespace)
+		}
+	}
+
+	return q, nStart, int(q.numChar) + q.numHash, nil
+}
+
+// Unquote unquotes the given string. It must be terminated with a quote or an
+// interpolation start.
+//
+// s is the literal's content with the opening delimiter already removed (the
+// package-level Unquote slices it off before calling this method). On error
+// the returned string is empty.
+func (q QuoteInfo) Unquote(s string) (string, error) {
+	if len(s) > 0 && !q.multiline {
+		// Raw newlines and carriage returns are not allowed in single-line
+		// strings.
+		if contains(s, '\n') || contains(s, '\r') {
+			return "", errSyntax
+		}
+		// Is it trivial? Avoid allocation.
+		if s[len(s)-1] == q.char &&
+			q.numHash == 0 &&
+			!contains(s, '\\') &&
+			!contains(s[:len(s)-1], q.char) {
+			return s[:len(s)-1], nil
+		}
+	}
+
+	var runeTmp [utf8.UTFMax]byte
+	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
+	// stripNL is true while the most recent output byte is a newline copied
+	// from the source; such a newline is dropped if the closing quote follows.
+	stripNL := false
+	for len(s) > 0 {
+		switch s[0] {
+		case '\r':
+			// Carriage returns in the source are dropped.
+			s = s[1:]
+			continue
+		case '\n':
+			// After a newline the line must either start with the shared
+			// indentation, which is stripped, or be empty; anything else is
+			// an indentation error.
+			switch {
+			case !q.multiline:
+				fallthrough
+			default:
+				return "", errInvalidWhitespace
+			case strings.HasPrefix(s[1:], q.whitespace):
+				s = s[1+len(q.whitespace):]
+			case strings.HasPrefix(s[1:], "\n"):
+				s = s[1:]
+			}
+			stripNL = true
+			buf = append(buf, '\n')
+			continue
+		}
+		c, multibyte, ss, err := unquoteChar(s, q)
+		if err != nil {
+			return "", err
+		}
+		// TODO: handle surrogates: if we have a left-surrogate, expect the
+		// next value to be a right surrogate. Otherwise this is an error.
+		s = ss
+		// Negative values are terminators reported by unquoteChar: -1 for the
+		// closing quote and -2 for an interpolation start.
+		if c < 0 {
+			if c == -2 {
+				stripNL = false
+			}
+			if stripNL {
+				// Strip the last newline, but only if it came from a closing
+				// quote.
+				buf = buf[:len(buf)-1]
+			}
+			return string(buf), nil
+		}
+		stripNL = false
+		// Single bytes (ASCII, or byte escapes that are not flagged multibyte)
+		// are appended as-is; everything else is UTF-8 encoded.
+		if c < utf8.RuneSelf || !multibyte {
+			buf = append(buf, byte(c))
+		} else {
+			n := utf8.EncodeRune(runeTmp[:], c)
+			buf = append(buf, runeTmp[:n]...)
+		}
+	}
+	// allow unmatched quotes if already checked.
+	// Reached when s is exhausted without a closing quote or interpolation
+	// start having been seen.
+	return "", errUnmatchedQuote
+}
+
+// contains reports whether the string contains the byte c.
+func contains(s string, c byte) bool {
+	return strings.IndexByte(s, c) >= 0
+}
+
+// unquoteChar decodes the first character or byte in the escaped string s.
+// It returns four values:
+//
+//	1) value, the decoded Unicode code point or byte value; the special value
+//	   of -1 indicates terminated by quotes and -2 means terminated by \(.
+//	2) multibyte, a boolean indicating whether the decoded character requires
+//	   a multibyte UTF-8 representation;
+//	3) tail, the remainder of the string after the character; and
+//	4) an error that will be nil if the character is syntactically valid.
+//
+// The second argument, info, describes the quotes of the literal being parsed
+// and thereby which sequences are recognized: an escape is a backslash
+// followed by info.numHash '#' characters (a backslash without them is a
+// literal backslash), the closing delimiter is info.numChar quote characters
+// followed by info.numHash hashes, and hexadecimal (\x) and octal escapes are
+// rejected when the quote character is '"'.
+//
+// s must not be empty.
+func unquoteChar(s string, info QuoteInfo) (value rune, multibyte bool, tail string, err error) {
+	// easy cases
+	switch c := s[0]; {
+	case c == info.char && info.char != 0:
+		// A quote character only terminates the string when the complete
+		// closing delimiter follows; otherwise it is ordinary content.
+		for i := 1; byte(i) < info.numChar; i++ {
+			if i >= len(s) || s[i] != info.char {
+				return rune(info.char), false, s[1:], nil
+			}
+		}
+		for i := 0; i < info.numHash; i++ {
+			if i+int(info.numChar) >= len(s) || s[i+int(info.numChar)] != '#' {
+				return rune(info.char), false, s[1:], nil
+			}
+		}
+		if ln := int(info.numChar) + info.numHash; len(s) != ln {
+			// TODO: terminating quote in middle of string
+			return 0, false, s[ln:], errSyntax
+		}
+		return -1, false, "", nil
+	case c >= utf8.RuneSelf:
+		r, size := utf8.DecodeRuneInString(s)
+		return r, true, s[size:], nil
+	case c != '\\':
+		return rune(s[0]), false, s[1:], nil
+	}
+
+	// A backslash that is not followed by the required number of hashes and
+	// an escape character is a literal backslash.
+	if len(s) <= 1+info.numHash {
+		return '\\', false, s[1:], nil
+	}
+	for i := 1; i <= info.numHash && i < len(s); i++ {
+		if s[i] != '#' {
+			return '\\', false, s[1:], nil
+		}
+	}
+
+	c := s[1+info.numHash]
+	s = s[2+info.numHash:]
+
+	switch c {
+	case 'a':
+		value = '\a'
+	case 'b':
+		value = '\b'
+	case 'f':
+		value = '\f'
+	case 'n':
+		value = '\n'
+	case 'r':
+		value = '\r'
+	case 't':
+		value = '\t'
+	case 'v':
+		value = '\v'
+	case '/':
+		value = '/'
+	case 'x', 'u', 'U':
+		n := 0
+		switch c {
+		case 'x':
+			n = 2
+		case 'u':
+			n = 4
+		case 'U':
+			n = 8
+		}
+		var v rune
+		if len(s) < n {
+			err = errSyntax
+			return
+		}
+		for j := 0; j < n; j++ {
+			x, ok := unhex(s[j])
+			if !ok {
+				err = errSyntax
+				return
+			}
+			v = v<<4 | x
+		}
+		s = s[n:]
+		if c == 'x' {
+			if info.char == '"' {
+				err = errSyntax
+				return
+			}
+			// single-byte string, possibly not UTF-8
+			value = v
+			break
+		}
+		// v is a signed 32-bit rune, so eight hex digits (\U) with the top bit
+		// set wrap to a negative number. Reject those explicitly: negative
+		// values are reserved as the "closing quote" (-1) and "interpolation"
+		// (-2) sentinels and must never result from an escape sequence.
+		if v < 0 || v > utf8.MaxRune {
+			err = errSyntax
+			return
+		}
+		value = v
+		multibyte = true
+	case '0', '1', '2', '3', '4', '5', '6', '7':
+		if info.char == '"' {
+			err = errSyntax
+			return
+		}
+		v := rune(c) - '0'
+		if len(s) < 2 {
+			err = errSyntax
+			return
+		}
+		for j := 0; j < 2; j++ { // one digit already; two more
+			x := rune(s[j]) - '0'
+			if x < 0 || x > 7 {
+				err = errSyntax
+				return
+			}
+			v = (v << 3) | x
+		}
+		s = s[2:]
+		if v > 255 {
+			err = errSyntax
+			return
+		}
+		value = v
+	case '\\':
+		value = '\\'
+	case '\'', '"':
+		// TODO: should we allow escaping of quotes regardless?
+		if c != info.char {
+			err = errSyntax
+			return
+		}
+		value = rune(c)
+	case '(':
+		// An interpolation start must be the very last thing in s.
+		if s != "" {
+			// TODO: terminating quote in middle of string
+			return 0, false, s, errSyntax
+		}
+		value = -2
+	default:
+		err = errSyntax
+		return
+	}
+	tail = s
+	return
+}
+
+// unhex converts a single ASCII hexadecimal digit to its numeric value. ok is
+// false, and v is zero, when b is not a hexadecimal digit.
+func unhex(b byte) (v rune, ok bool) {
+	switch {
+	case b >= '0' && b <= '9':
+		return rune(b - '0'), true
+	case b >= 'a' && b <= 'f':
+		return rune(b-'a') + 10, true
+	case b >= 'A' && b <= 'F':
+		return rune(b-'A') + 10, true
+	default:
+		return 0, false
+	}
+}

diff --git a/cue/literal/string_test.go b/cue/literal/string_test.go
new file mode 100644
index 0000000..dd00b64
--- /dev/null
+++ b/cue/literal/string_test.go

@@ -0,0 +1,164 @@
+// Copyright 2019 CUE Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package literal
+
+import (
+	"fmt"
+	"testing"
+)
+
+// TestUnquote runs Unquote over a table of complete literals and compares
+// both the returned string and the returned error (by identity) against the
+// expected values.
+func TestUnquote(t *testing.T) {
+	testCases := []struct {
+		in, out string
+		err     error
+	}{
+		// Single-line and multiline literals without custom delimiters.
+		{`"Hello"`, "Hello", nil},
+		{`'Hello'`, "Hello", nil},
+		{`'Hellø'`, "Hellø", nil},
+		{`"""` + "\n\t\tHello\n\t\t" + `"""`, "Hello", nil},
+		{"'''\n\t\tHello\n\t\t'''", "Hello", nil},
+		{"'''\n\t\tHello\n\n\t\t'''", "Hello\n", nil},
+		{"'''\n\n\t\tHello\n\t\t'''", "\nHello", nil},
+		{"'''\n\n\n\n\t\t'''", "\n\n", nil},
+		{"'''\n\t\t'''", "", nil},
+		{`"""` + "\n\raaa\n\rbbb\n\r" + `"""`, "aaa\nbbb", nil},
+		// Escape sequences.
+		{`'\a\b\f\n\r\t\v\'\\\/'`, "\a\b\f\n\r\t\v'\\/", nil},
+		{`"\a\b\f\n\r\t\v\"\\\/"`, "\a\b\f\n\r\t\v\"\\/", nil},
+		{`#"The sequence "\U0001F604" renders as \#U0001F604."#`,
+			`The sequence "\U0001F604" renders as 😄.`,
+			nil},
+		{`"  \U00010FfF"`, "  \U00010fff", nil},
+		{`"\u0061 "`, "a ", nil},
+		{`'\x61\x55'`, "\x61\x55", nil},
+		{`'\061\055'`, "\061\055", nil},
+		{`'\377 '`, "\377 ", nil},
+		{"'e\u0300\\n'", "e\u0300\n", nil},
+		{`'\06\055'`, "", errSyntax},
+		{`'\0'`, "", errSyntax},
+		{`"\06\055"`, "", errSyntax},    // too short
+		{`'\777 '`, "", errSyntax},      // overflow
+		{`'\U012301'`, "", errSyntax},   // too short
+		{`'\U0123012G'`, "", errSyntax}, // invalid digit G
+		{`"\x04"`, "", errSyntax},       // not allowed in strings
+		{`'\U01230123'`, "", errSyntax}, // too large
+
+		{`"\\"`, "\\", nil},
+		{`"\'"`, "", errSyntax},
+		{`"\q"`, "", errSyntax},
+		{"'\n'", "", errSyntax},
+		{"'---\n---'", "", errSyntax},
+		{"'''\r'''", "", errMissingNewline},
+
+		// Custom escape delimiters ('#').
+		{`#"Hello"#`, "Hello", nil},
+		{`#"Hello\v"#`, "Hello\\v", nil},
+		{`#"Hello\#v\r"#`, "Hello\v\\r", nil},
+		{`##"Hello\##v\r"##`, "Hello\v\\r", nil},
+		{`##"Hello\##v"##`, "Hello\v", nil},
+		{"#'''\n\t\tHello\\#v\n\t\t'''#", "Hello\v", nil},
+		{"##'''\n\t\tHello\\#v\n\t\t'''##", "Hello\\#v", nil},
+		{`#"""` + "\n\t\t\\#r\n\t\t" + `"""#`, "\r", nil},
+		{`#""#`, "", nil},
+		{`#"This is a "dog""#`, `This is a "dog"`, nil},
+		{"#\"\"\"\n\"\n\"\"\"#", `"`, nil},
+		{"#\"\"\"\n\"\"\"\n\"\"\"#", `"""`, nil},
+		{"#\"\"\"\n\na\n\n\"\"\"#", "\na\n", nil},
+		// Gobble extra \r
+		{"#\"\"\"\n\ra\n\r\"\"\"#", `a`, nil},
+		{"#\"\"\"\n\r\n\ra\n\r\n\r\"\"\"#", "\na\n", nil},
+		// Make sure this works for Windows.
+		{"#\"\"\"\r\n\r\na\r\n\r\n\"\"\"#", "\na\n", nil},
+		{"#\"\"\"\r\n \r\n a\r\n \r\n \"\"\"#", "\na\n", nil},
+		{"#\"\"\"\r\na\r\n\"\"\"#", `a`, nil},
+		{"#\"\"\"\r\n\ra\r\n\r\"\"\"#", `a`, nil},
+		{`####"   \"####`, `   \`, nil},
+
+		// Malformed delimiters and indentation.
+		{"```", "", errSyntax},
+		{"Hello", "", errSyntax},
+		{`"Hello`, "", errUnmatchedQuote},
+		{`"""Hello"""`, "", errMissingNewline},
+		{"'''\n  Hello\n   '''", "", errInvalidWhitespace},
+		{"'''\n   a\n  b\n   '''", "", errInvalidWhitespace},
+		{`"Hello""`, "", errSyntax},
+		{`#"Hello"`, "", errUnmatchedQuote},
+		{`#"Hello'#`, "", errUnmatchedQuote},
+		{`#"""#`, "", errMissingNewline},
+
+		// TODO: should these be legal?
+		// NOTE(review): this entry is identical to the last case of the group
+		// above.
+		{`#"""#`, "", errMissingNewline},
+	}
+	for i, tc := range testCases {
+		// The subtest name carries the table index so that cases with
+		// unprintable input can still be told apart.
+		t.Run(fmt.Sprintf("%d/%s", i, tc.in), func(t *testing.T) {
+			if got, err := Unquote(tc.in); err != tc.err {
+				t.Errorf("error: got %q; want %q", err, tc.err)
+			} else if got != tc.out {
+				t.Errorf("value: got %q; want %q", got, tc.out)
+			}
+		})
+	}
+}
+
+// TestInterpolation checks QuoteInfo.Unquote on string fragments that end in
+// an interpolation start, or that fail to terminate, rather than in a closing
+// quote. The quotes column only supplies the quote style; in is the fragment
+// handed to Unquote without any opening delimiter.
+func TestInterpolation(t *testing.T) {
+	testCases := []struct {
+		quotes string
+		in     string
+		out    string
+		err    error
+	}{
+		{`""`, `foo\(`, "foo", nil},
+		{`"""` + "\n" + `"""`, `foo`, "", errUnmatchedQuote},
+		{`#""#`, `foo\#(`, "foo", nil},
+		{`#""#`, `foo\(`, "", errUnmatchedQuote},
+		{`""`, `foo\(bar`, "", errSyntax},
+		{`""`, ``, "", errUnmatchedQuote},
+		{`#""#`, `"`, "", errUnmatchedQuote},
+		{`#""#`, `\`, "", errUnmatchedQuote},
+		{`##""##`, `\'`, "", errUnmatchedQuote},
+	}
+	for i, tc := range testCases {
+		t.Run(fmt.Sprintf("%d/%s/%s", i, tc.quotes, tc.in), func(t *testing.T) {
+			info, _, _, err := ParseQuotes(tc.quotes, tc.quotes)
+			if err != nil {
+				// A broken quotes fixture would otherwise run the case with a
+				// partially filled QuoteInfo and report a misleading failure.
+				t.Fatal(err)
+			}
+			if got, err := info.Unquote(tc.in); err != tc.err {
+				t.Errorf("error: got %q; want %q", err, tc.err)
+			} else if got != tc.out {
+				t.Errorf("value: got %q; want %q", got, tc.out)
+			}
+		})
+	}
+}
+
+// TestIsDouble parses a set of empty literals in every quote style and checks
+// that IsDouble reports true for the double-quoted ones and false for the
+// single-quoted ones.
+func TestIsDouble(t *testing.T) {
+	type testCase struct {
+		quotes string
+		double bool
+	}
+	cases := []testCase{
+		{quotes: `""`, double: true},
+		{quotes: `"""` + "\n" + `"""`, double: true},
+		{quotes: `#""#`, double: true},
+		{quotes: `''`, double: false},
+		{quotes: `'''` + "\n" + `'''`, double: false},
+		{quotes: `#''#`, double: false},
+	}
+	for idx, c := range cases {
+		name := fmt.Sprintf("%d/%s", idx, c.quotes)
+		t.Run(name, func(t *testing.T) {
+			q, _, _, err := ParseQuotes(c.quotes, c.quotes)
+			if err != nil {
+				t.Fatal(err)
+			}
+			want := c.double
+			if got := q.IsDouble(); got != want {
+				t.Errorf("got %v; want %v", got, want)
+			}
+		})
+	}
+}

diff --git a/cue/parser/parser.go b/cue/parser/parser.go
index 461b78b..77f483b 100644
--- a/cue/parser/parser.go
+++ b/cue/parser/parser.go

@@ -16,12 +16,12 @@
 
 import (
 	"fmt"
-	"strconv"
 	"strings"
 	"unicode"
 
 	"cuelang.org/go/cue/ast"
 	"cuelang.org/go/cue/errors"
+	"cuelang.org/go/cue/literal"
 	"cuelang.org/go/cue/scanner"
 	"cuelang.org/go/cue/token"
 )
@@ -1225,7 +1225,7 @@
 
 func isValidImport(lit string) bool {
 	const illegalChars = `!"#$%&'()*,:;<=>?[\]^{|}` + "`\uFFFD"
-	s, _ := strconv.Unquote(lit) // go/scanner returns a legal string literal
+	s, _ := literal.Unquote(lit) // go/scanner returns a legal string literal
 	for _, r := range s {
 		if !unicode.IsGraphic(r) || unicode.IsSpace(r) || strings.ContainsRune(illegalChars, r) {
 			return false

diff --git a/pkg/strconv/manual.go b/pkg/strconv/manual.go
index e179ad4..cad0a30 100644
--- a/pkg/strconv/manual.go
+++ b/pkg/strconv/manual.go

@@ -14,11 +14,13 @@
 
 package strconv
 
+import "cuelang.org/go/cue/literal"
+
 // Unquote interprets s as a single-quoted, double-quoted,
 // or backquoted CUE string literal, returning the string value
 // that s quotes.
 func Unquote(s string) (string, error) {
-	return Unquote(s)
+	return literal.Unquote(s)
 }
 
 // TODO: replace parsing functions with parsing to apd
commit	f54e594863f0cbd75a4cc04ab0c6dfd4dcdb4932	[log] [tgz]
author	Marcel van Lohuizen <mpvl@google.com>	Tue Feb 19 01:28:58 2019 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Tue Feb 19 01:28:58 2019 +0000
tree	6f70def4af53efedecd5be51a5022a18fafd0479
parent	bbdfb896e3cc2248b69b17fb2cd875d852c0c28b [diff]
parent	6ceb601c732b2a118a149a28786835412f4bfb2b [diff]