1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
|
// Copyright 2017, The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package diff implements an algorithm for producing edit-scripts.
// The edit-script is a sequence of operations needed to transform one list
// of symbols into another (or vice-versa). The edits allowed are insertions,
// deletions, and modifications. The summation of all edits is called the
// Levenshtein distance as this problem is well-known in computer science.
//
// This package prioritizes performance over accuracy. That is, the run time
// is more important than obtaining a minimal Levenshtein distance.
package diff
import (
"math/rand"
"time"
"github.com/google/go-cmp/cmp/internal/flags"
)
// EditType represents a single operation within an edit-script.
type EditType uint8
const (
// Identity indicates that a symbol pair is identical in both list X and Y.
Identity EditType = iota
// UniqueX indicates that a symbol only exists in X and not Y.
UniqueX
// UniqueY indicates that a symbol only exists in Y and not X.
UniqueY
// Modified indicates that a symbol pair is a modification of each other.
Modified
)
// EditScript represents the series of differences between two lists.
type EditScript []EditType
// String returns a human-readable string representing the edit-script where
// Identity, UniqueX, UniqueY, and Modified are represented by the
// '.', 'X', 'Y', and 'M' characters, respectively.
func (es EditScript) String() string {
b := make([]byte, len(es))
for i, e := range es {
switch e {
case Identity:
b[i] = '.'
case UniqueX:
b[i] = 'X'
case UniqueY:
b[i] = 'Y'
case Modified:
b[i] = 'M'
default:
panic("invalid edit-type")
}
}
return string(b)
}
// stats returns a histogram of the number of each type of edit operation.
func (es EditScript) stats() (s struct{ NI, NX, NY, NM int }) {
for _, e := range es {
switch e {
case Identity:
s.NI++
case UniqueX:
s.NX++
case UniqueY:
s.NY++
case Modified:
s.NM++
default:
panic("invalid edit-type")
}
}
return
}
// Dist is the Levenshtein distance and is guaranteed to be 0 if and only if
// lists X and Y are equal.
func (es EditScript) Dist() int { return len(es) - es.stats().NI }
// LenX is the length of the X list.
func (es EditScript) LenX() int { return len(es) - es.stats().NY }
// LenY is the length of the Y list.
func (es EditScript) LenY() int { return len(es) - es.stats().NX }
// EqualFunc reports whether the symbols at indexes ix and iy are equal.
// When called by Difference, the index is guaranteed to be within nx and ny.
type EqualFunc func(ix int, iy int) Result
// Result is the result of comparison.
// NumSame is the number of sub-elements that are equal.
// NumDiff is the number of sub-elements that are not equal.
type Result struct{ NumSame, NumDiff int }
// BoolResult returns a Result that is either Equal or not Equal.
func BoolResult(b bool) Result {
if b {
return Result{NumSame: 1} // Equal, Similar
} else {
return Result{NumDiff: 2} // Not Equal, not Similar
}
}
// Equal indicates whether the symbols are equal. Two symbols are equal
// if and only if NumDiff == 0. If Equal, then they are also Similar.
func (r Result) Equal() bool { return r.NumDiff == 0 }
// Similar indicates whether two symbols are similar and may be represented
// by using the Modified type. As a special case, we consider binary comparisons
// (i.e., those that return Result{1, 0} or Result{0, 1}) to be similar.
//
// The exact ratio of NumSame to NumDiff to determine similarity may change.
func (r Result) Similar() bool {
// Use NumSame+1 to offset NumSame so that binary comparisons are similar.
return r.NumSame+1 >= r.NumDiff
}
var randBool = rand.New(rand.NewSource(time.Now().Unix())).Intn(2) == 0
// Difference reports whether two lists of lengths nx and ny are equal
// given the definition of equality provided as f.
//
// This function returns an edit-script, which is a sequence of operations
// needed to convert one list into the other. The following invariants for
// the edit-script are maintained:
// - eq == (es.Dist()==0)
// - nx == es.LenX()
// - ny == es.LenY()
//
// This algorithm is not guaranteed to be an optimal solution (i.e., one that
// produces an edit-script with a minimal Levenshtein distance). This algorithm
// favors performance over optimality. The exact output is not guaranteed to
// be stable and may change over time.
func Difference(nx, ny int, f EqualFunc) (es EditScript) {
// This algorithm is based on traversing what is known as an "edit-graph".
// See Figure 1 from "An O(ND) Difference Algorithm and Its Variations"
// by Eugene W. Myers. Since D can be as large as N itself, this is
// effectively O(N^2). Unlike the algorithm from that paper, we are not
// interested in the optimal path, but at least some "decent" path.
//
// For example, let X and Y be lists of symbols:
// X = [A B C A B B A]
// Y = [C B A B A C]
//
// The edit-graph can be drawn as the following:
// A B C A B B A
// ┌─────────────┐
// C │_|_|\|_|_|_|_│ 0
// B │_|\|_|_|\|\|_│ 1
// A │\|_|_|\|_|_|\│ 2
// B │_|\|_|_|\|\|_│ 3
// A │\|_|_|\|_|_|\│ 4
// C │ | |\| | | | │ 5
// └─────────────┘ 6
// 0 1 2 3 4 5 6 7
//
// List X is written along the horizontal axis, while list Y is written
// along the vertical axis. At any point on this grid, if the symbol in
// list X matches the corresponding symbol in list Y, then a '\' is drawn.
// The goal of any minimal edit-script algorithm is to find a path from the
// top-left corner to the bottom-right corner, while traveling through the
// fewest horizontal or vertical edges.
// A horizontal edge is equivalent to inserting a symbol from list X.
// A vertical edge is equivalent to inserting a symbol from list Y.
// A diagonal edge is equivalent to a matching symbol between both X and Y.
// Invariants:
// - 0 ≤ fwdPath.X ≤ (fwdFrontier.X, revFrontier.X) ≤ revPath.X ≤ nx
// - 0 ≤ fwdPath.Y ≤ (fwdFrontier.Y, revFrontier.Y) ≤ revPath.Y ≤ ny
//
// In general:
// - fwdFrontier.X < revFrontier.X
// - fwdFrontier.Y < revFrontier.Y
//
// Unless, it is time for the algorithm to terminate.
fwdPath := path{+1, point{0, 0}, make(EditScript, 0, (nx+ny)/2)}
revPath := path{-1, point{nx, ny}, make(EditScript, 0)}
fwdFrontier := fwdPath.point // Forward search frontier
revFrontier := revPath.point // Reverse search frontier
// Search budget bounds the cost of searching for better paths.
// The longest sequence of non-matching symbols that can be tolerated is
// approximately the square-root of the search budget.
searchBudget := 4 * (nx + ny) // O(n)
// Running the tests with the "cmp_debug" build tag prints a visualization
// of the algorithm running in real-time. This is educational for
// understanding how the algorithm works. See debug_enable.go.
f = debug.Begin(nx, ny, f, &fwdPath.es, &revPath.es)
// The algorithm below is a greedy, meet-in-the-middle algorithm for
// computing sub-optimal edit-scripts between two lists.
//
// The algorithm is approximately as follows:
// - Searching for differences switches back-and-forth between
// a search that starts at the beginning (the top-left corner), and
// a search that starts at the end (the bottom-right corner).
// The goal of the search is connect with the search
// from the opposite corner.
// - As we search, we build a path in a greedy manner,
// where the first match seen is added to the path (this is sub-optimal,
// but provides a decent result in practice). When matches are found,
// we try the next pair of symbols in the lists and follow all matches
// as far as possible.
// - When searching for matches, we search along a diagonal going through
// through the "frontier" point. If no matches are found,
// we advance the frontier towards the opposite corner.
// - This algorithm terminates when either the X coordinates or the
// Y coordinates of the forward and reverse frontier points ever intersect.
// This algorithm is correct even if searching only in the forward direction
// or in the reverse direction. We do both because it is commonly observed
// that two lists commonly differ because elements were added to the front
// or end of the other list.
//
// Non-deterministically start with either the forward or reverse direction
// to introduce some deliberate instability so that we have the flexibility
// to change this algorithm in the future.
if flags.Deterministic || randBool {
goto forwardSearch
} else {
goto reverseSearch
}
forwardSearch:
{
// Forward search from the beginning.
if fwdFrontier.X >= revFrontier.X || fwdFrontier.Y >= revFrontier.Y || searchBudget == 0 {
goto finishSearch
}
for stop1, stop2, i := false, false, 0; !(stop1 && stop2) && searchBudget > 0; i++ {
// Search in a diagonal pattern for a match.
z := zigzag(i)
p := point{fwdFrontier.X + z, fwdFrontier.Y - z}
switch {
case p.X >= revPath.X || p.Y < fwdPath.Y:
stop1 = true // Hit top-right corner
case p.Y >= revPath.Y || p.X < fwdPath.X:
stop2 = true // Hit bottom-left corner
case f(p.X, p.Y).Equal():
// Match found, so connect the path to this point.
fwdPath.connect(p, f)
fwdPath.append(Identity)
// Follow sequence of matches as far as possible.
for fwdPath.X < revPath.X && fwdPath.Y < revPath.Y {
if !f(fwdPath.X, fwdPath.Y).Equal() {
break
}
fwdPath.append(Identity)
}
fwdFrontier = fwdPath.point
stop1, stop2 = true, true
default:
searchBudget-- // Match not found
}
debug.Update()
}
// Advance the frontier towards reverse point.
if revPath.X-fwdFrontier.X >= revPath.Y-fwdFrontier.Y {
fwdFrontier.X++
} else {
fwdFrontier.Y++
}
goto reverseSearch
}
reverseSearch:
{
// Reverse search from the end.
if fwdFrontier.X >= revFrontier.X || fwdFrontier.Y >= revFrontier.Y || searchBudget == 0 {
goto finishSearch
}
for stop1, stop2, i := false, false, 0; !(stop1 && stop2) && searchBudget > 0; i++ {
// Search in a diagonal pattern for a match.
z := zigzag(i)
p := point{revFrontier.X - z, revFrontier.Y + z}
switch {
case fwdPath.X >= p.X || revPath.Y < p.Y:
stop1 = true // Hit bottom-left corner
case fwdPath.Y >= p.Y || revPath.X < p.X:
stop2 = true // Hit top-right corner
case f(p.X-1, p.Y-1).Equal():
// Match found, so connect the path to this point.
revPath.connect(p, f)
revPath.append(Identity)
// Follow sequence of matches as far as possible.
for fwdPath.X < revPath.X && fwdPath.Y < revPath.Y {
if !f(revPath.X-1, revPath.Y-1).Equal() {
break
}
revPath.append(Identity)
}
revFrontier = revPath.point
stop1, stop2 = true, true
default:
searchBudget-- // Match not found
}
debug.Update()
}
// Advance the frontier towards forward point.
if revFrontier.X-fwdPath.X >= revFrontier.Y-fwdPath.Y {
revFrontier.X--
} else {
revFrontier.Y--
}
goto forwardSearch
}
finishSearch:
// Join the forward and reverse paths and then append the reverse path.
fwdPath.connect(revPath.point, f)
for i := len(revPath.es) - 1; i >= 0; i-- {
t := revPath.es[i]
revPath.es = revPath.es[:i]
fwdPath.append(t)
}
debug.Finish()
return fwdPath.es
}
type path struct {
dir int // +1 if forward, -1 if reverse
point // Leading point of the EditScript path
es EditScript
}
// connect appends any necessary Identity, Modified, UniqueX, or UniqueY types
// to the edit-script to connect p.point to dst.
func (p *path) connect(dst point, f EqualFunc) {
if p.dir > 0 {
// Connect in forward direction.
for dst.X > p.X && dst.Y > p.Y {
switch r := f(p.X, p.Y); {
case r.Equal():
p.append(Identity)
case r.Similar():
p.append(Modified)
case dst.X-p.X >= dst.Y-p.Y:
p.append(UniqueX)
default:
p.append(UniqueY)
}
}
for dst.X > p.X {
p.append(UniqueX)
}
for dst.Y > p.Y {
p.append(UniqueY)
}
} else {
// Connect in reverse direction.
for p.X > dst.X && p.Y > dst.Y {
switch r := f(p.X-1, p.Y-1); {
case r.Equal():
p.append(Identity)
case r.Similar():
p.append(Modified)
case p.Y-dst.Y >= p.X-dst.X:
p.append(UniqueY)
default:
p.append(UniqueX)
}
}
for p.X > dst.X {
p.append(UniqueX)
}
for p.Y > dst.Y {
p.append(UniqueY)
}
}
}
func (p *path) append(t EditType) {
p.es = append(p.es, t)
switch t {
case Identity, Modified:
p.add(p.dir, p.dir)
case UniqueX:
p.add(p.dir, 0)
case UniqueY:
p.add(0, p.dir)
}
debug.Update()
}
type point struct{ X, Y int }
func (p *point) add(dx, dy int) { p.X += dx; p.Y += dy }
// zigzag maps a consecutive sequence of integers to a zig-zag sequence.
//
// [0 1 2 3 4 5 ...] => [0 -1 +1 -2 +2 ...]
func zigzag(x int) int {
if x&1 != 0 {
x = ^x
}
return x >> 1
}
|