cmd/cue/cmd: merge data files by default

This also harmonizes handling of schema versus data.

The distinction between data and schema is not always
clear. Yet the cue command makes a notable distinction
on how the two are handled and even varied behavior
based on whether the two were mixed or not.

Also, there was a difference in behavior based on CUE
vs JSON files: CUE files were merged in a single package,
whereas JSON files were printed one-by-one.

Advantages:
- less distinction between different formats
- merge JSON files on the command line
- merge schema and data on the command line

There are a few exceptions:
- cue vet still validates data files individually
- schema are separated when the -d flag is used.

The latter is kind of an eye sore and we probably need
a different approach selecting schema.

Change-Id: I4d61b363d4ad08f4d0cfb1fe4bdf4f15b106988e
Reviewed-on: https://cue-review.googlesource.com/c/cue/+/5745
Reviewed-by: Marcel van Lohuizen <mpvl@golang.org>
diff --git a/cmd/cue/cmd/common.go b/cmd/cue/cmd/common.go
index 0508105..ce20539 100644
--- a/cmd/cue/cmd/common.go
+++ b/cmd/cue/cmd/common.go
@@ -128,8 +128,8 @@
 	// If orphanFiles are mixed with CUE files and/or if placement flags are used,
 	// the instance is also included in insts.
 	importing      bool
-	orphanedData   []*build.File
-	orphanedSchema []*build.File
+	mergeData      bool // do not merge individual data files.
+	orphaned       []*build.File
 	orphanInstance *build.Instance
 	// imported files are files that were orphaned in the build instance, but
 	// were placed in the instance by using one the --files, --list or --path
@@ -151,7 +151,7 @@
 // instance, with which the data instance may be merged.
 func (b *buildPlan) instances() iterator {
 	var i iterator
-	if len(b.orphanedData) == 0 && len(b.orphanedSchema) == 0 {
+	if len(b.orphaned) == 0 {
 		i = &instanceIterator{a: buildInstances(b.cmd, b.insts), i: -1}
 	} else {
 		i = newStreamingIterator(b)
@@ -208,7 +208,7 @@
 func newStreamingIterator(b *buildPlan) *streamingIterator {
 	i := &streamingIterator{
 		cfg: b.encConfig,
-		a:   b.orphanedData,
+		a:   b.orphaned,
 		b:   b,
 	}
 
@@ -355,6 +355,8 @@
 	fileFilter     string
 	interpretation build.Interpretation
 
+	noMerge bool // do not merge individual data files.
+
 	loadCfg *load.Config
 }
 
@@ -394,53 +396,85 @@
 			p.insts = append(p.insts, b)
 			continue
 		}
+		addedUser := false
 		if len(b.BuildFiles) > 0 {
+			addedUser = true
 			p.insts = append(p.insts, b)
 		}
 		if ok {
 			continue
 		}
 
-		if len(b.OrphanedFiles) > 0 {
-			if p.orphanInstance != nil {
-				return nil, errors.Newf(token.NoPos,
-					"builds contain two file packages")
-			}
-			p.orphanInstance = b
-			p.encConfig.Stream = true
+		if len(b.OrphanedFiles) == 0 {
+			continue
 		}
 
+		if p.orphanInstance != nil {
+			return nil, errors.Newf(token.NoPos,
+				"builds contain two file packages")
+		}
+		p.orphanInstance = b
+		p.encConfig.Stream = true
+
 		for _, f := range b.OrphanedFiles {
-			switch f.Interpretation {
-			case build.JSONSchema, build.OpenAPI:
-				p.orphanedSchema = append(p.orphanedSchema, f)
-				continue
-			}
 			switch f.Encoding {
-			case build.Protobuf:
-				p.orphanedSchema = append(p.orphanedSchema, f)
-			case build.YAML, build.JSON, build.Text:
-				p.orphanedData = append(p.orphanedData, f)
+			case build.Protobuf, build.YAML, build.JSON, build.Text:
 			default:
 				return nil, errors.Newf(token.NoPos,
 					"unsupported encoding %q", f.Encoding)
 			}
 		}
 
-		switch {
-		case len(p.insts) > 0 && len(p.orphanedSchema) > 0:
-			return nil, errors.Newf(token.NoPos,
-				"cannot define packages and schema")
-		case len(p.orphanedData) > 0 && len(p.orphanedSchema) > 1:
-			// TODO: allow this when schema have ID specified.
-			return nil, errors.Newf(token.NoPos,
-				"cannot define data with more than one schema")
-		case len(p.orphanedData) > 0 && len(p.orphanedSchema) == 1:
-			b.BuildFiles = append(b.BuildFiles, p.orphanedSchema...)
-			p.insts = append(p.insts, b)
-		case len(p.orphanedSchema) > 0:
-			p.orphanedData = p.orphanedSchema
+		if !p.mergeData && p.schema == nil {
+			p.orphaned = append(p.orphaned, b.OrphanedFiles...)
+			continue
 		}
+
+		// TODO: this processing could probably be delayed, or at least
+		// simplified. The main reason to do this here is to allow interpreting
+		// the --schema/-d flag, while allowing to use this for OpenAPI and
+		// JSON Schema in auto-detect mode.
+		buildFiles := []*build.File{}
+		for _, f := range b.OrphanedFiles {
+			d := encoding.NewDecoder(f, p.encConfig)
+			for ; !d.Done(); d.Next() {
+				file := d.File()
+				sub := &build.File{
+					Filename:       d.Filename(),
+					Encoding:       f.Encoding,
+					Interpretation: d.Interpretation(),
+					Form:           f.Form,
+					Tags:           f.Tags,
+					Source:         file,
+				}
+				if p.schema != nil && d.Interpretation() == "" {
+					switch sub.Encoding {
+					case build.YAML, build.JSON, build.Text:
+						p.orphaned = append(p.orphaned, sub)
+						continue
+					}
+				}
+				buildFiles = append(buildFiles, sub)
+				if err := b.AddSyntax(file); err != nil {
+					return nil, err
+				}
+			}
+			if err := d.Err(); err != nil {
+				return nil, err
+			}
+		}
+
+		if !addedUser && len(b.Files) > 1 {
+			p.insts = append(p.insts, b)
+		} else if len(p.orphaned) == 0 {
+			// Instance with only a single build: just print the file.
+			p.orphaned = append(p.orphaned, buildFiles...)
+		}
+	}
+
+	if len(p.insts) > 1 && p.schema != nil {
+		return nil, errors.Newf(token.NoPos,
+			"cannot use --schema/-d flag more than one schema")
 	}
 
 	if len(p.expressions) > 1 {
@@ -450,6 +484,8 @@
 }
 
 func (b *buildPlan) parseFlags() (err error) {
+	b.mergeData = !b.cfg.noMerge && flagMerge.Bool(b.cmd)
+
 	out := flagOut.String(b.cmd)
 	outFile := flagOutFile.String(b.cmd)
 
diff --git a/cmd/cue/cmd/flags.go b/cmd/cue/cmd/flags.go
index 26ef387..131a2be 100644
--- a/cmd/cue/cmd/flags.go
+++ b/cmd/cue/cmd/flags.go
@@ -38,6 +38,7 @@
 	flagEscape      flagName = "escape"
 	flagGlob        flagName = "name"
 	flagRecursive   flagName = "recursive"
+	flagMerge       flagName = "merge"
 	flagList        flagName = "list"
 	flagPath        flagName = "path"
 	flagFiles       flagName = "files"
@@ -79,6 +80,7 @@
 	f.Bool(string(flagWithContext), false, "import as object with contextual data")
 	f.StringArrayP(string(flagProtoPath), "I", nil, "paths in which to search for imports")
 	f.StringP(string(flagGlob), "n", "", "glob filter for file names")
+	f.Bool(string(flagMerge), true, "merge non-CUE files")
 }
 
 type flagName string
diff --git a/cmd/cue/cmd/help.go b/cmd/cue/cmd/help.go
index 2f580a1..90218c6 100644
--- a/cmd/cue/cmd/help.go
+++ b/cmd/cue/cmd/help.go
@@ -69,23 +69,20 @@
 
 Non-cue files are interpreted based on their file extension or,
 if present, an explicit file qualifier (see the "filetypes"
-help topic). Non-cue files may be interpreted as concrete data
-or schema. Schema are treated as single-file packages by default.
-See the "filetypes" and "flags" help topics on how to combine
-schema into a single package.
+help topic). By default, all recognized files are unified at
+their root value. See the "filetypes" and "flags" help topics
+on how to treat each file individually or how to combine them
+differently.
 
-Data files can be combined into a single package, in which case
-each file is unified into a defined location within this single
-package. If a data file has multiple values, such as allowed
-with JSON Lines or YAML, each value is interpreted as a separate
-file.
+If a data file has multiple values, such as allowed with JSON
+Lines or YAML, each value is interpreted as a separate file.
 
-The --schema/-d flag can be used to unify each data file against
-a specific schema within a non-data package. For OpenAPI, the -d
-flag specifies a schema name. For JSON Schema the -d flag
-specifies a schema defined in "definitions". In all other cases,
-the -d flag is a CUE expression that is evaluated within the
-package.
+If the --schema/-d is specified, data files are not merged, and
+are compared against the specified schema within a package or
+non-data file. For OpenAPI, the -d flag specifies a schema name.
+For JSON Schema the -d flag specifies a schema defined in
+"definitions". In all other cases, the -d flag is a CUE
+expression that is evaluated within the package.
 
 Examples (also see also "flags" and "filetypes" help topics):
 
@@ -104,9 +101,16 @@
 var flagsHelp = &cobra.Command{
 	Use:   "flags",
 	Short: "common flags for composing packages",
-	Long: `Non-CUE files are treated as individual files by
-default, but can be combined into a single package using a
-combination of the following flags.
+	Long: `Non-CUE files are merged at their roots by default.
+The can be combined differently or treated as different files
+by using a combination of the following flags.
+
+
+Individual files
+
+To treat non-cue files as individual files, use --no-merge flag.
+This is the default for vet. This flag only applies to data files
+when used in combination with the --schema/-d flag.
 
 
 Assigning values to a CUE path
diff --git a/cmd/cue/cmd/orphans.go b/cmd/cue/cmd/orphans.go
index a833bfd..94b2eea 100644
--- a/cmd/cue/cmd/orphans.go
+++ b/cmd/cue/cmd/orphans.go
@@ -47,7 +47,11 @@
 				flagWithContext, flagPath, flagList, flagFiles,
 			)
 		}
-		return false, err
+		// TODO: should we remove this optimization? This is really added as a
+		// conversion safety.
+		if len(i.OrphanedFiles)+len(i.BuildFiles) <= 1 || b.cfg.noMerge {
+			return false, err
+		}
 	}
 
 	pkg := b.encConfig.PkgName
@@ -85,8 +89,9 @@
 
 		// Filter only need to filter files that can stream:
 		for ; !d.Done(); d.Next() {
-			if expr := d.File(); expr != nil {
-				objs = append(objs, expr)
+			if f := d.File(); f != nil {
+				f.Filename = newName(d.Filename(), 0)
+				objs = append(objs, f)
 			}
 		}
 		if err := d.Err(); err != nil {
@@ -104,18 +109,31 @@
 			}
 			continue
 		}
-		if len(objs) > 1 && len(path) == 0 && !useList {
+		// TODO: consider getting rid of this requirement. It is important that
+		// import will catch conflicts ahead of time then, though, and report
+		// this messages as a possible solution if there are conflicts.
+		if b.importing && len(objs) > 1 && len(path) == 0 && !useList {
 			return false, fmt.Errorf(
 				"%s, %s, or %s flag needed to handle multiple objects in file %s",
 				flagPath, flagList, flagFiles, shortFile(i.Root, f))
 		}
 
-		f, err := placeOrphans(b.cmd, d.Filename(), pkg, objs...)
-		if err != nil {
-			return false, err
+		if !useList && len(path) == 0 && !useContext {
+			for _, f := range objs {
+				if pkg := c.PkgName; pkg != "" {
+					internal.SetPackage(f, pkg, false)
+				}
+				files = append(files, f)
+			}
+		} else {
+			// TODO: handle imports correctly, i.e. for proto.
+			f, err := placeOrphans(b.cmd, d.Filename(), pkg, objs...)
+			if err != nil {
+				return false, err
+			}
+			f.Filename = newName(d.Filename(), 0)
+			files = append(files, f)
 		}
-		f.Filename = newName(d.Filename(), 0)
-		files = append(files, f)
 	}
 
 	b.imported = append(b.imported, files...)
@@ -241,8 +259,7 @@
 	}
 
 	if pkg != "" {
-		p := &ast.Package{Name: ast.NewIdent(pkg)}
-		f.Decls = append([]ast.Decl{p}, f.Decls...)
+		internal.SetPackage(f, pkg, false)
 	}
 
 	if flagList.Bool(cmd) {
diff --git a/cmd/cue/cmd/testdata/script/def_openapi.txt b/cmd/cue/cmd/testdata/script/def_openapi.txt
index 47d0b95..500b88e 100644
--- a/cmd/cue/cmd/testdata/script/def_openapi.txt
+++ b/cmd/cue/cmd/testdata/script/def_openapi.txt
@@ -20,6 +20,10 @@
 cue def openapi+cue: expect-cue-out -o -
 cmp stdout expect-cue2
 
+# combine
+cue def -p foo openapi.json openapi2.json
+cmp stdout expect-cue3
+
 -- foo.cue --
 // Some clever title.
 
@@ -75,6 +79,36 @@
         }
     }
 }
+-- openapi2.json --
+{
+    "openapi": "3.0.0",
+    "info": {
+        "title":   "My OpenAPI2",
+        "version": "v1alpha1"
+    },
+    "paths": {},
+    "components": {
+        "schemas": {
+            "Baz": {
+                "type": "object",
+                "required": [
+                    "a",
+                    "b"
+                ],
+                "properties": {
+                    "a": {
+                        "type": "integer"
+                    },
+                    "b": {
+                        "type": "integer",
+                        "minimum": 0,
+                        "exclusiveMaximum": 10
+                    }
+                }
+            }
+        }
+    }
+}
 -- expect-json-out --
 {
     "openapi": "3.0.0",
@@ -201,3 +235,39 @@
 	b: >=0 & <10
 	...
 }
+-- expect-cue2 --
+info: {
+	title:   *"Some clever title." | string
+	version: *"v1" | string
+}
+Bar :: {
+	foo: Foo
+	...
+}
+Foo :: {
+	a: int
+	b: >=0 & <10
+	...
+}
+-- expect-cue3 --
+// My OpenAPI
+package foo
+
+info: {
+	title:   string | *_|_
+	version: *"v1alpha1" | string
+}
+Bar :: {
+	foo: Foo
+	...
+}
+Foo :: {
+	a: int
+	b: >=0 & <10
+	...
+}
+Baz :: {
+	a: int
+	b: >=0 & <10
+	...
+}
diff --git a/cmd/cue/cmd/testdata/script/vet_altdata.txt b/cmd/cue/cmd/testdata/script/vet_altdata.txt
index b6a3617..d07992b 100644
--- a/cmd/cue/cmd/testdata/script/vet_altdata.txt
+++ b/cmd/cue/cmd/testdata/script/vet_altdata.txt
@@ -13,8 +13,6 @@
 
 -- export-stdout --
 {
-    "a": "b"
-}
-{
+    "a": "b",
     "c": "d"
 }
diff --git a/cmd/cue/cmd/testdata/script/vet_data.txt b/cmd/cue/cmd/testdata/script/vet_data.txt
index 7831007..5d36377 100644
--- a/cmd/cue/cmd/testdata/script/vet_data.txt
+++ b/cmd/cue/cmd/testdata/script/vet_data.txt
@@ -28,6 +28,9 @@
     ./schema.cue:3:8
     ./data.yaml:5:12
 -- export-stderr --
+languages.2.tag: conflicting values string and false (mismatched types string and bool):
+    ./data.yaml:6:11
+    ./schema.cue:2:8
 languages.1.name: invalid value "dutch" (does not match =~"^\\p{Lu}"):
     ./schema.cue:3:8
     ./data.yaml:5:12
diff --git a/cmd/cue/cmd/vet.go b/cmd/cue/cmd/vet.go
index 1c26f42..deb623d 100644
--- a/cmd/cue/cmd/vet.go
+++ b/cmd/cue/cmd/vet.go
@@ -89,13 +89,15 @@
 // TODO: allow unrooted schema, such as JSON schema to compare against
 // other values.
 func doVet(cmd *Command, args []string) error {
-	b, err := parseArgs(cmd, args, nil)
+	b, err := parseArgs(cmd, args, &config{
+		noMerge: true,
+	})
 	exitOnErr(cmd, err, true)
 
 	// Go into a special vet mode if the user explicitly specified non-cue
 	// files on the command line.
 	// TODO: unify these two modes.
-	if len(b.orphanedData) > 0 {
+	if len(b.orphaned) > 0 {
 		vetFiles(cmd, b)
 		return nil
 	}
diff --git a/internal/encoding/encoding.go b/internal/encoding/encoding.go
index 379e378..03fc7ab 100644
--- a/internal/encoding/encoding.go
+++ b/internal/encoding/encoding.go
@@ -43,16 +43,17 @@
 )
 
 type Decoder struct {
-	cfg       *Config
-	closer    io.Closer
-	next      func() (ast.Expr, error)
-	interpret interpretFunc
-	expr      ast.Expr
-	file      *ast.File
-	filename  string // may change on iteration for some formats
-	id        string
-	index     int
-	err       error
+	cfg            *Config
+	closer         io.Closer
+	next           func() (ast.Expr, error)
+	interpretFunc  interpretFunc
+	interpretation build.Interpretation
+	expr           ast.Expr
+	file           *ast.File
+	filename       string // may change on iteration for some formats
+	id             string
+	index          int
+	err            error
 }
 
 type interpretFunc func(*cue.Instance) (file *ast.File, id string, err error)
@@ -62,10 +63,14 @@
 func (i *Decoder) ID() string {
 	return i.id
 }
-
 func (i *Decoder) Filename() string { return i.filename }
-func (i *Decoder) Index() int       { return i.index }
-func (i *Decoder) Done() bool       { return i.err != nil }
+
+// Interpretation returns the current interpretation detected by Detect.
+func (i *Decoder) Interpretation() build.Interpretation {
+	return i.interpretation
+}
+func (i *Decoder) Index() int { return i.index }
+func (i *Decoder) Done() bool { return i.err != nil }
 
 func (i *Decoder) Next() {
 	if i.err != nil {
@@ -83,7 +88,7 @@
 
 func (i *Decoder) doInterpret() {
 	// Interpretations
-	if i.interpret != nil {
+	if i.interpretFunc != nil {
 		var r cue.Runtime
 		i.file = i.File()
 		inst, err := r.CompileFile(i.file)
@@ -91,7 +96,7 @@
 			i.err = err
 			return
 		}
-		i.file, i.id, i.err = i.interpret(inst)
+		i.file, i.id, i.err = i.interpretFunc(inst)
 	}
 }
 
@@ -176,8 +181,8 @@
 	case build.Auto:
 		openAPI := openAPIFunc(cfg, f)
 		jsonSchema := jsonSchemaFunc(cfg, f)
-		i.interpret = func(inst *cue.Instance) (file *ast.File, id string, err error) {
-			switch Detect(inst.Value()) {
+		i.interpretFunc = func(inst *cue.Instance) (file *ast.File, id string, err error) {
+			switch i.interpretation = Detect(inst.Value()); i.interpretation {
 			case build.JSONSchema:
 				return jsonSchema(inst)
 			case build.OpenAPI:
@@ -186,9 +191,11 @@
 			return i.file, "", i.err
 		}
 	case build.OpenAPI:
-		i.interpret = openAPIFunc(cfg, f)
+		i.interpretation = build.OpenAPI
+		i.interpretFunc = openAPIFunc(cfg, f)
 	case build.JSONSchema:
-		i.interpret = jsonSchemaFunc(cfg, f)
+		i.interpretation = build.JSONSchema
+		i.interpretFunc = jsonSchemaFunc(cfg, f)
 	default:
 		i.err = fmt.Errorf("unsupported interpretation %q", f.Interpretation)
 	}
diff --git a/internal/internal.go b/internal/internal.go
index 58b69ee..1cf2584 100644
--- a/internal/internal.go
+++ b/internal/internal.go
@@ -30,6 +30,7 @@
 	"golang.org/x/xerrors"
 
 	"cuelang.org/go/cue/ast"
+	"cuelang.org/go/cue/ast/astutil"
 	"cuelang.org/go/cue/errors"
 	"cuelang.org/go/cue/token"
 )
@@ -113,6 +114,32 @@
 	return nil, "", f.Pos()
 }
 
+func SetPackage(f *ast.File, name string, overwrite bool) {
+	p, str, _ := PackageInfo(f)
+	if p != nil {
+		if !overwrite || str == name {
+			return
+		}
+		ident := ast.NewIdent(name)
+		astutil.CopyMeta(ident, p.Name)
+		return
+	}
+
+	decls := make([]ast.Decl, len(f.Decls)+1)
+	k := 0
+	for _, d := range f.Decls {
+		if _, ok := d.(*ast.CommentGroup); ok {
+			decls[k] = d
+			k++
+			continue
+		}
+		break
+	}
+	decls[k] = &ast.Package{Name: ast.NewIdent(name)}
+	copy(decls[k+1:], f.Decls[k:])
+	f.Decls = decls
+}
+
 // NewComment creates a new CommentGroup from the given text.
 // Each line is prefixed with "//" and the last newline is removed.
 // Useful for ASTs generated by code other than the CUE parser.