internal/core/adt: strip "notDefault" status for disjunctions with unused defaults

This most fixes all known default issues. The semantics is still not
entirely right, and will require a slightly different order of evaluation for
disjunctions. This is best addressed when doing some of the planned
performance enhancements for disjunctions.

Fixes #763
Fixes #726

This CL changes the semantics as follows:
1) a marked disjunction becomes an unmarked disjunction if,
after the conjuncts are distributed over disjuncts, none of
its marked disjuncts remain
2) a nested disjunction a | b | (nestedA | nestedC) is
evaluated according to the same rules, and then incorporated as is.

This implementation still slightly deviates from these rules:
The probem with the current implementation is that nested
disjunctions are unified agaist existing conjuncts, and possibly
eliminated early, before applying the rules. So in
```
     ("a" | "b") & (*(*"a" | string) | string)
```
the inner `*"a" | string` is combined with `"a" | "b"` before it
is determined whether its result is a marked disjunction. This was
okay with the previous semantics and had the benefit of allowing to
eliminate some disjuncts earlier, but no longer works with the new
semantics. Also if we use structure sharing, the benefits of reuse
will likely be bigger then early elimination in this case.

Some of the complexity of the logic in this CL is introduced
to deal with this discrepency. It can be simplified again
considerably when the required change in control flow is
implemented.

Change-Id: I638bbb66ba69bd1642261dd2628dfcc27b14f893
Reviewed-on: https://cue-review.googlesource.com/c/cue/+/8706
Reviewed-by: Marcel van Lohuizen <mpvl@golang.org>
diff --git a/cue/testdata/disjunctions/embed.txtar b/cue/testdata/disjunctions/embed.txtar
index 72de3c1..5d8e063 100644
--- a/cue/testdata/disjunctions/embed.txtar
+++ b/cue/testdata/disjunctions/embed.txtar
@@ -35,6 +35,32 @@
     {}
 }
 
+defaultsMulti: {
+    a: {
+        #def: {
+            *{} | {a: string} | {b: string}
+            *{} | {c: string} | {d: string}
+        }
+
+        a: #def & {a: "foo"}
+    }
+
+    b: {
+        #def: {
+            *{} | {a: string} | {b: string}
+            *{} | {c: string} | {d: string}
+            *{} | {d: string} | {e: string}
+        }
+
+        a: #def & {a: "foo", e: "bar"}
+    }
+}
+
+nested: {
+	a: 1 | 2 | *(
+		(3 | 4 | *( 5 | 6 | *7)) & ( 3 | 4 | ( *7 | 8 )))
+}
+
 -- out/eval --
 (struct){
   default: (struct){
@@ -83,6 +109,137 @@
       }) }
     a: (int){ 2 }
   }
+  defaultsMulti: (struct){
+    a: (struct){
+      #def: (struct){ |(*(#struct){
+        }, (#struct){
+          c: (string){ string }
+        }, (#struct){
+          d: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          c: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          c: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          d: (string){ string }
+        }) }
+      a: (struct){ |(*(#struct){
+          a: (string){ "foo" }
+        }, (#struct){
+          a: (string){ "foo" }
+          c: (string){ string }
+        }, (#struct){
+          a: (string){ "foo" }
+          d: (string){ string }
+        }) }
+    }
+    b: (struct){
+      #def: (struct){ |(*(#struct){
+        }, (#struct){
+          d: (string){ string }
+        }, (#struct){
+          e: (string){ string }
+        }, (#struct){
+          c: (string){ string }
+        }, (#struct){
+          c: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          c: (string){ string }
+          e: (string){ string }
+        }, (#struct){
+          d: (string){ string }
+        }, (#struct){
+          d: (string){ string }
+        }, (#struct){
+          d: (string){ string }
+          e: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          e: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          c: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          c: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          c: (string){ string }
+          e: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          a: (string){ string }
+          d: (string){ string }
+          e: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          e: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          c: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          c: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          c: (string){ string }
+          e: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          d: (string){ string }
+        }, (#struct){
+          b: (string){ string }
+          d: (string){ string }
+          e: (string){ string }
+        }) }
+      a: (struct){ |(*(#struct){
+          a: (string){ "foo" }
+          e: (string){ "bar" }
+        }, (#struct){
+          a: (string){ "foo" }
+          e: (string){ "bar" }
+          c: (string){ string }
+        }, (#struct){
+          a: (string){ "foo" }
+          e: (string){ "bar" }
+          d: (string){ string }
+        }) }
+    }
+  }
+  nested: (struct){
+    a: (int){ |(*(int){ 7 }, (int){ 2 }, (int){ 3 }, (int){ 4 }, (int){ 1 }) }
+  }
 }
 -- out/compile --
 --- in.cue
@@ -128,4 +285,49 @@
     〈0;#y〉
     {}
   }
+  defaultsMulti: {
+    a: {
+      #def: {
+        (*{}|{
+          a: string
+        }|{
+          b: string
+        })
+        (*{}|{
+          c: string
+        }|{
+          d: string
+        })
+      }
+      a: (〈0;#def〉 & {
+        a: "foo"
+      })
+    }
+    b: {
+      #def: {
+        (*{}|{
+          a: string
+        }|{
+          b: string
+        })
+        (*{}|{
+          c: string
+        }|{
+          d: string
+        })
+        (*{}|{
+          d: string
+        }|{
+          e: string
+        })
+      }
+      a: (〈0;#def〉 & {
+        a: "foo"
+        e: "bar"
+      })
+    }
+  }
+  nested: {
+    a: (1|2|*((3|4|*(5|6|*7)) & (3|4|(*7|8))))
+  }
 }
diff --git a/cue/testdata/disjunctions/specdeviation.txtar b/cue/testdata/disjunctions/specdeviation.txtar
index 4ebf0d4..605639b 100644
--- a/cue/testdata/disjunctions/specdeviation.txtar
+++ b/cue/testdata/disjunctions/specdeviation.txtar
@@ -24,6 +24,12 @@
 P: 2
 p: *P | int // now 2, but should be (2 | int), according to the spec:
 
+// Here the inner default may not be used as it is masked by the outer default.
+r: (*3 | (*1 | 2)) & (1 | 2)
+
+// Here the inner default is used, as there are no defaults marked in the
+// outer disjunction.
+s: (3 | (*1 | 2)) & (1 | 2)
 
 s1: #Size & { min: 5 }
 
@@ -32,12 +38,39 @@
     res: uint | * 0
     min: >res | *(1 + res)
 }
+
+staged: {
+    c: ("a" | "b") & (*(*"a" | string) | string)
+    d: (*(*"a" | string) | string) & ("a" | "b")
+}
+
+issue763a: {
+  #A: {
+    v: "a" | "b" | "c" // change to string to fix
+  }
+
+  h: [string]: #A
+
+  h: [=~"^b"]: #A & {
+    v: *h.a.v | string
+  }
+
+  h: a: {
+    v: *"a" | string
+  }
+
+  h: baa: _
+  h: boo: _
+}
+
 -- out/eval --
 (struct){
   Q: (int){ |(*(int){ 1 }, (int){ int }) }
   q: (int){ |(*(int){ 1 }, (int){ int }) }
   P: (int){ 2 }
   p: (int){ |(*(int){ 2 }, (int){ int }) }
+  r: (int){ |((int){ 1 }, (int){ 2 }) }
+  s: (int){ |(*(int){ 1 }, (int){ 2 }) }
   s1: (#struct){
     max: (number){ |(*(int){ 5 }, (number){ >5 }) }
     res: (int){ |(*(int){ 0 }, (int){ &(>=0, int) }) }
@@ -48,6 +81,26 @@
     res: (int){ |(*(int){ 0 }, (int){ &(>=0, int) }) }
     min: (number){ |(*(int){ 1 }, (number){ >0 }) }
   }
+  staged: (struct){
+    c: (string){ |(*(string){ "a" }, (string){ "b" }) }
+    d: (string){ |(*(string){ "a" }, (string){ "b" }) }
+  }
+  issue763a: (struct){
+    #A: (#struct){
+      v: (string){ |((string){ "a" }, (string){ "b" }, (string){ "c" }) }
+    }
+    h: (struct){
+      a: (#struct){
+        v: (string){ |(*(string){ "a" }, (string){ "b" }, (string){ "c" }) }
+      }
+      baa: (#struct){
+        v: (string){ |(*(string){ "a" }, (string){ "b" }, (string){ "c" }) }
+      }
+      boo: (#struct){
+        v: (string){ |(*(string){ "a" }, (string){ "b" }, (string){ "c" }) }
+      }
+    }
+  }
 }
 -- out/compile --
 --- in.cue
@@ -57,6 +110,8 @@
   P: (*1|int)
   P: 2
   p: (*〈0;P〉|int)
+  r: ((*3|(*1|2)) & (1|2))
+  s: ((3|(*1|2)) & (1|2))
   s1: (〈0;#Size〉 & {
     min: 5
   })
@@ -65,4 +120,32 @@
     res: (&(int, >=0)|*0)
     min: (>〈0;res〉|*(1 + 〈0;res〉))
   }
+  staged: {
+    c: (("a"|"b") & (*(*"a"|string)|string))
+    d: ((*(*"a"|string)|string) & ("a"|"b"))
+  }
+  issue763a: {
+    #A: {
+      v: ("a"|"b"|"c")
+    }
+    h: {
+      [string]: 〈1;#A〉
+    }
+    h: {
+      [=~"^b"]: (〈1;#A〉 & {
+        v: (*〈2;h〉.a.v|string)
+      })
+    }
+    h: {
+      a: {
+        v: (*"a"|string)
+      }
+    }
+    h: {
+      baa: _
+    }
+    h: {
+      boo: _
+    }
+  }
 }
diff --git a/doc/ref/spec.md b/doc/ref/spec.md
index 923faf4..5809fec 100644
--- a/doc/ref/spec.md
+++ b/doc/ref/spec.md
@@ -711,6 +711,10 @@
 So `a | b | *c | d` is a single marked disjunction of four terms,
 whereas `a | (b | *c | d)` is an unmarked disjunction of two terms,
 one of which is a marked disjunction of three terms.
+During unification, if all the marked disjuncts of a marked disjunction are
+eliminated, then the remaining unmarked disjuncts are considered as if they
+originated from an unmarked disjunction
+<!-- TODO: this formulation should be worked out more.  -->
 As explained below, distinguishing the nesting of disjunctions like this
 is only relevant when both an outer and nested disjunction are marked.
 
diff --git a/internal/core/adt/disjunct.go b/internal/core/adt/disjunct.go
index 8d538a8..636f07d 100644
--- a/internal/core/adt/disjunct.go
+++ b/internal/core/adt/disjunct.go
@@ -88,6 +88,13 @@
 	expr        *DisjunctionExpr
 	value       *Disjunction
 	hasDefaults bool
+
+	// These are used for book keeping, tracking whether any of the
+	// disjuncts marked with a default marker remains after unification.
+	// If no default is used, all other elements are treated as "maybeDefault".
+	// Otherwise, elements are treated as is.
+	parentDefaultUsed bool
+	childDefaultUsed  bool
 }
 
 func (n *nodeContext) addDisjunction(env *Environment, x *DisjunctionExpr, cloneID CloseInfo) {
@@ -102,19 +109,19 @@
 	}
 
 	n.disjunctions = append(n.disjunctions,
-		envDisjunct{env, cloneID, x, nil, numDefaults > 0})
+		envDisjunct{env, cloneID, x, nil, numDefaults > 0, false, false})
 }
 
 func (n *nodeContext) addDisjunctionValue(env *Environment, x *Disjunction, cloneID CloseInfo) {
 	n.disjunctions = append(n.disjunctions,
-		envDisjunct{env, cloneID, nil, x, x.HasDefaults})
+		envDisjunct{env, cloneID, nil, x, x.HasDefaults, false, false})
 
 }
 
 func (n *nodeContext) expandDisjuncts(
 	state VertexStatus,
 	parent *nodeContext,
-	m defaultMode,
+	parentMode defaultMode, // default mode of this disjunct
 	recursive bool) {
 
 	n.ctx.stats.DisjunctCount++
@@ -136,6 +143,8 @@
 		n.snapshot = *n.node
 	}
 
+	defaultOffset := len(n.usedDefault)
+
 	switch {
 	default: // len(n.disjunctions) == 0
 		m := *n
@@ -180,6 +189,12 @@
 			n.node.BaseValue = n.getValidators()
 		}
 
+		n.usedDefault = append(n.usedDefault, defaultInfo{
+			parentMode: parentMode,
+			nestedMode: parentMode,
+			origMode:   parentMode,
+		})
+
 	case len(n.disjunctions) > 0:
 		// Process full disjuncts to ensure that erroneous disjuncts are
 		// eliminated as early as possible.
@@ -187,33 +202,8 @@
 
 		n.disjuncts = append(n.disjuncts, n)
 
-		// HACK: this is an AWFUL, AWFUL HACK to work around a limitation of
-		// using defaults for marking oneOfs in protobuffers. Previously this
-		// was worked around by another hack that (deliberately erroneously)
-		// would move the default status of a disjunct of which only one
-		// disjunct remained from "not default" to "maybe default". For
-		// protobuf oneOfs this would mean that only the "intended" struct would
-		// get the default value. It also worked around various other
-		// limitations.
-		//
-		// With the latest performance enhancements this old hack still worked,
-		// but only because it introduced a bug that this hack relied on. Fixing
-		// this bug now causes this hack to no longer work.
-		//
-		// Ultimately, the correct way to address the issue for the protobuf
-		// representation is not to use defaults at all. Instead we should use
-		// the required annotator (which fixes a whole load of other issues):
-		//
-		//    {} | {a!: int} | {b!: int}
-		//
-		// This would force that only one of these can be true independent of
-		// default magic. Aside from fixing this issue, it also moves to a model
-		// that is consistent with the recommendation to not use defaults in
-		// a top-level API specification.
-		//
-		// The hack we use now recognizes the oneOf patterns and then sets the
-		// default for the "smallest" element.
-		protoForm := true
+		n.refCount++
+		defer n.free()
 
 		for i, d := range n.disjunctions {
 			a := n.disjuncts
@@ -225,23 +215,6 @@
 				n.ctx.inDisjunct++
 			}
 
-			// HACK: see above
-			defaultCount := 0
-			override := true
-			if d.expr == nil {
-				override = false
-			} else {
-				for _, v := range d.expr.Values {
-					if !d.hasDefaults || v.Default {
-						defaultCount++
-						if s, ok := v.Val.(*StructLit); !ok || len(s.Decls) > 0 {
-							protoForm = false
-							override = false
-						}
-					}
-				}
-			}
-
 			for _, dn := range a {
 				switch {
 				case d.expr != nil:
@@ -253,14 +226,7 @@
 						c := MakeConjunct(d.env, v.Val, d.cloneID)
 						cn.addExprConjunct(c)
 
-						if override {
-							if s, ok := v.Val.(*StructLit); ok && len(s.Decls) == 0 {
-								cn.protoCount++
-							}
-						}
-
 						newMode := mode(d.hasDefaults, v.Default)
-						cn.defaultMode = combineDefault(dn.defaultMode, newMode)
 
 						cn.expandDisjuncts(state, n, newMode, true)
 					}
@@ -274,7 +240,6 @@
 						cn.addValueConjunct(d.env, v, d.cloneID)
 
 						newMode := mode(d.hasDefaults, i < d.value.NumDefaults)
-						cn.defaultMode = combineDefault(dn.defaultMode, newMode)
 
 						cn.expandDisjuncts(state, n, newMode, true)
 					}
@@ -300,39 +265,98 @@
 			}
 		}
 
-		// HACK: see above
-		if protoForm {
-			min := int32(0)
-			minPos := 0
-			for i, d := range n.disjuncts {
-				if d.defaultMode == isDefault {
-					min = 0
-					break
+		// Annotate disjunctions with whether any of the default disjunctions
+		// was used.
+		for _, d := range n.disjuncts {
+			for i, info := range d.usedDefault[defaultOffset:] {
+				if info.parentMode == isDefault {
+					n.disjunctions[i].parentDefaultUsed = true
 				}
-				if d.protoCount > min {
-					min = d.protoCount
-					minPos = i
+				if info.origMode == isDefault {
+					n.disjunctions[i].childDefaultUsed = true
 				}
 			}
-			if min > 0 {
-				n.disjuncts[minPos].defaultMode = isDefault
-			}
 		}
 
+		// Combine parent and child default markers, considering that a parent
+		// "notDefault" is treated as "maybeDefault" if none of the disjuncts
+		// marked as default remain.
+		//
+		// NOTE for a parent marked as "notDefault", a child is *never*
+		// considered as default. It may either be "not" or "maybe" default.
+		//
+		// The result for each disjunction is conjoined into a single value.
+		for _, d := range n.disjuncts {
+			m := maybeDefault
+			orig := maybeDefault
+			for i, info := range d.usedDefault[defaultOffset:] {
+				parent := info.parentMode
+
+				used := n.disjunctions[i].parentDefaultUsed
+				childUsed := n.disjunctions[i].childDefaultUsed
+				hasDefaults := n.disjunctions[i].hasDefaults
+
+				orig = combineDefault(orig, info.parentMode)
+				orig = combineDefault(orig, info.nestedMode)
+
+				switch {
+				case childUsed:
+					// One of the children used a default. This is "normal"
+					// mode. This may also happen when we are in
+					// hasDefaults/notUsed mode. Consider
+					//
+					//      ("a" | "b") & (*(*"a" | string) | string)
+					//
+					// Here the doubly nested default is called twice, once
+					// for "a" and then for "b", where the second resolves to
+					// not using a default. The first does, however, and on that
+					// basis the "ot default marker cannot be overridden.
+					m = combineDefault(m, info.parentMode)
+					m = combineDefault(m, info.origMode)
+
+				case !hasDefaults, used:
+					m = combineDefault(m, info.parentMode)
+					m = combineDefault(m, info.nestedMode)
+
+				case hasDefaults && !used:
+					Assertf(parent == notDefault, "unexpected default mode")
+				}
+			}
+			d.defaultMode = m
+
+			d.usedDefault = d.usedDefault[:defaultOffset]
+			d.usedDefault = append(d.usedDefault, defaultInfo{
+				parentMode: parentMode,
+				nestedMode: m,
+				origMode:   orig,
+			})
+
+		}
+
+		// TODO: this is an old trick that seems no longer necessary for the new
+		// implementation. Keep around until we finalize the semantics for
+		// defaults, though. The recursion of nested defaults is not entirely
+		// proper yet.
+		//
+		// A better approach, that avoids the need for recursion (semantically),
+		// would be to only consider default usage for one level, but then to
+		// also allow a default to be passed if only one value is remaining.
+		// This means that a nested subsumption would first have to be evaluated
+		// in isolation, however, to determine that it is not previous
+		// disjunctions that cause the disambiguation.
+		//
 		// HACK alert: this replaces the hack of the previous algorithm with a
 		// slightly less worse hack: instead of dropping the default info when
-		// the value was scalar before, we drop this information when there
-		// is only one disjunct, while not discarding hard defaults.
-		// TODO: a more principled approach would be to recognize that there
-		// is only one default at a point where this does not break
-		// commutativity.
-		if len(n.disjuncts) == 1 && n.disjuncts[0].defaultMode != isDefault {
-			n.disjuncts[0].defaultMode = maybeDefault
-		}
+		// the value was scalar before, we drop this information when there is
+		// only one disjunct, while not discarding hard defaults. TODO: a more
+		// principled approach would be to recognize that there is only one
+		// default at a point where this does not break commutativity. if
+		// if len(n.disjuncts) == 1 && n.disjuncts[0].defaultMode != isDefault {
+		// 	n.disjuncts[0].defaultMode = maybeDefault
+		// }
 	}
 
 	// Compare to root, but add to this one.
-	// TODO: if only one value is left, set to maybeDefault.
 	switch p := parent; {
 	case p != n:
 		p.disjunctErrs = append(p.disjunctErrs, n.disjunctErrs...)
@@ -340,17 +364,22 @@
 
 	outer:
 		for _, d := range n.disjuncts {
-			for _, v := range p.disjuncts {
+			for k, v := range p.disjuncts {
 				if Equal(n.ctx, &v.result, &d.result, true) {
-					if d.defaultMode == isDefault {
-						v.defaultMode = isDefault
+					m := maybeDefault
+					for _, u := range d.usedDefault {
+						m = combineDefault(m, u.nestedMode)
 					}
-					d.free()
+					if m == isDefault {
+						p.disjuncts[k] = d
+						v.free()
+					} else {
+						d.free()
+					}
 					continue outer
 				}
 			}
 
-			d.defaultMode = combineDefault(m, d.defaultMode)
 			p.disjuncts = append(p.disjuncts, d)
 		}
 
diff --git a/internal/core/adt/eval.go b/internal/core/adt/eval.go
index 2ebc66f..99ca681 100644
--- a/internal/core/adt/eval.go
+++ b/internal/core/adt/eval.go
@@ -783,16 +783,33 @@
 	hasTop      bool
 	hasCycle    bool // has conjunct with structural cycle
 	hasNonCycle bool // has conjunct without structural cycle
-	protoCount  int32
 
 	// Disjunction handling
 	disjunctions []envDisjunct
+
+	// usedDefault indicates the for each of possibly multiple parent
+	// disjunctions whether it is unified with a default disjunct or not.
+	// This is then later used to determine whether a disjunction should
+	// be treated as a marked disjunction.
+	usedDefault []defaultInfo
+
 	defaultMode  defaultMode
 	disjuncts    []*nodeContext
 	buffer       []*nodeContext
 	disjunctErrs []*Bottom
 }
 
+type defaultInfo struct {
+	// parentMode indicates whether this values was used as a default value,
+	// based on the parent mode.
+	parentMode defaultMode
+
+	// The result of default evaluation for a nested disjunction.
+	nestedMode defaultMode
+
+	origMode defaultMode
+}
+
 func (n *nodeContext) addNotify(v *Vertex) {
 	if v != nil {
 		n.notify = append(n.notify, v)
@@ -833,6 +850,8 @@
 	d.lists = append(d.lists, n.lists...)
 	d.vLists = append(d.vLists, n.vLists...)
 	d.exprs = append(d.exprs, n.exprs...)
+	d.usedDefault = append(d.usedDefault, n.usedDefault...)
+
 	// No need to clone d.disjunctions
 
 	return d
@@ -858,6 +877,7 @@
 			vLists:        n.vLists[:0],
 			exprs:         n.exprs[:0],
 			disjunctions:  n.disjunctions[:0],
+			usedDefault:   n.usedDefault[:0],
 			disjunctErrs:  n.disjunctErrs[:0],
 			disjuncts:     n.disjuncts[:0],
 			buffer:        n.buffer[:0],