1 /* 2 * Copyright 2008 Google Inc. All Rights Reserved. 3 * Copyright 2013-2014 Jan Krüger. All Rights Reserved. 4 * Author: fraser@google.com (Neil Fraser) 5 * Author: anteru@developer.shelter13.net (Matthaeus G. Chajdas) 6 * Author: jan@jandoe.de (Jan Krüger) 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 * 20 * Diff Match and Patch 21 * http://code.google.com/p/google-diff-match-patch/ 22 */ 23 module ddmp.diff; 24 25 import ddmp.util; 26 27 import std.array; 28 import std.conv; 29 import std.datetime : SysTime, Clock, UTC; 30 import std.exception : enforce; 31 import std..string : indexOf, endsWith, startsWith; 32 import std.uni; 33 import std.regex; 34 import std.algorithm : min, max; 35 import std.digest.sha; 36 import core.time; 37 38 39 Duration diffTimeout = 1.seconds; 40 int DIFF_EDIT_COST = 4; 41 42 43 /** 44 * Compute and return the source text (all equalities and deletions). 45 * @param diffs List of Diff objects. 46 * @return Source text. 47 */ 48 string diff_text1(Diff[] diffs) { 49 auto text = appender!string(); 50 foreach ( d; diffs ) { 51 if (d.operation != Operation.INSERT) { 52 text.put(d.text); 53 } 54 } 55 return text.data(); 56 } 57 58 /** 59 * Compute and return the destination text (all equalities and insertions). 60 * @param diffs List of Diff objects. 61 * @return Destination text. 62 */ 63 string diff_text2(Diff[] diffs) { 64 auto text = appender!string(); 65 foreach ( d; diffs ) { 66 if (d.operation != Operation.DELETE) { 67 text.put(d.text); 68 } 69 } 70 return text.data(); 71 } 72 73 74 /** 75 * Compute the Levenshtein distance; the number of inserted, deleted or 76 * substituted characters. 77 * @param diffs List of Diff objects. 78 * @return Number of changes. 79 */ 80 int levenshtein(Diff[] diffs) { 81 int levenshtein = 0; 82 int insertions = 0; 83 int deletions = 0; 84 foreach ( d ; diffs ) { 85 final switch (d.operation) { 86 case Operation.INSERT: 87 insertions += d.text.length; 88 break; 89 case Operation.DELETE: 90 deletions += d.text.length; 91 break; 92 case Operation.EQUAL: 93 // A deletion and an insertion is one substitution. 94 levenshtein += max(insertions, deletions); 95 insertions = 0; 96 deletions = 0; 97 break; 98 } 99 } 100 levenshtein += max(insertions, deletions); 101 return levenshtein; 102 } 103 104 105 /** 106 * Crush the diff into an encoded string which describes the operations 107 * required to transform text1 into text2. 108 * E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. 109 * Operations are tab-separated. Inserted text is escaped using %xx 110 * notation. 111 * @param diffs Array of Diff objects. 112 * @return Delta text. 113 */ 114 string toDelta(in Diff[] diffs) 115 { 116 import std.range : walkLength; 117 import std.format : formattedWrite; 118 import std.uri : encode; 119 auto text = appender!string; 120 foreach (aDiff; diffs) { 121 final switch (aDiff.operation) { 122 case Operation.INSERT: 123 text.formattedWrite("+%s\t", encode(aDiff.text).replace("%20", " ")); 124 break; 125 case Operation.DELETE: 126 text.formattedWrite("-%s\t", aDiff.text.walkLength); 127 break; 128 case Operation.EQUAL: 129 text.formattedWrite("=%s\t", aDiff.text.walkLength); 130 break; 131 } 132 } 133 string delta = text.data; 134 if (delta.length != 0) { 135 // Strip off trailing tab character. 136 delta = delta[0 .. $-1]; 137 delta = unescapeForEncodeUriCompatibility(delta); 138 } 139 return delta; 140 } 141 142 /** 143 * Given the original text1, and an encoded string which describes the 144 * operations required to transform text1 into text2, comAdde the full diff. 145 * @param text1 Source string for the diff. 146 * @param delta Delta text. 147 * @return Array of Diff objects or null if invalid. 148 * @throws ArgumentException If invalid input. 149 */ 150 Diff[] fromDelta(string text1, string delta) 151 { 152 import std.algorithm; 153 import std.range; 154 import std..string : format; 155 import std.uri : decodeComponent; 156 157 auto diffs = appender!(Diff[]); 158 foreach (token; delta.splitter("\t")) { 159 if (token.length == 0) { 160 // Blank tokens are ok (from a trailing \t). 161 continue; 162 } 163 // Each token begins with a one character parameter which specifies the 164 // operation of this token (delete, insert, equality). 165 string param = token[1 .. $]; 166 switch (token[0]) { 167 case '+': 168 // decode would change all "+" to " " 169 param = param.replace("+", "%2b"); 170 param = decodeComponent(param); 171 //} catch (UnsupportedEncodingException e) { 172 // // Not likely on modern system. 173 // throw new Error("This system does not support UTF-8.", e); 174 //} catch (IllegalArgumentException e) { 175 // // Malformed URI sequence. 176 // throw new IllegalArgumentException( 177 // "Illegal escape in diff_fromDelta: " + param, e); 178 //} 179 diffs ~= Diff(Operation.INSERT, param); 180 break; 181 case '-': // Fall through. 182 case '=': 183 int n; 184 try { 185 n = param.to!int; 186 } catch (ConvException e) { 187 throw new Exception("Invalid number in diff_fromDelta: " ~ param); 188 } 189 enforce (n >= 0, "Negative number in diff_fromDelta: " ~ param); 190 191 string text; 192 enforce (n <= text1.walkLength, "Delta length larger than source text length."); 193 text = text1.takeExactly(n).array.to!string; 194 text1.popFrontN(n); 195 if (token[0] == '=') { 196 diffs ~= Diff(Operation.EQUAL, text); 197 } else { 198 diffs ~= Diff(Operation.DELETE, text); 199 } 200 break; 201 default: 202 // Anything else is an error. 203 throw new Exception( 204 "Invalid diff operation in diff_fromDelta: " ~ token[0]); 205 } 206 } 207 if (text1.length > 0) 208 throw new Exception("Delta length smaller than source text length."); 209 return diffs.data; 210 } 211 212 struct LinesToCharsResult { 213 string text1; 214 string text2; 215 string[] uniqueStrings; 216 bool opEquals()(auto ref const LinesToCharsResult other) const { 217 return text1 == other.text1 && 218 text2 == other.text2 && 219 uniqueStrings == other.uniqueStrings; 220 } 221 } 222 223 LinesToCharsResult linesToChars(string text1, string text2) 224 { 225 size_t[string] lineHash; 226 LinesToCharsResult res; 227 res.uniqueStrings = [""]; 228 res.text1 = linesToCharsMunge(text1, res.uniqueStrings, lineHash); 229 res.text2 = linesToCharsMunge(text2, res.uniqueStrings, lineHash); 230 return res; 231 } 232 233 string linesToCharsMunge(string text, ref string[] lines, ref size_t[string] linehash) 234 { 235 sizediff_t lineStart = 0; 236 sizediff_t lineEnd = -1; 237 string line; 238 auto chars = appender!string(); 239 while( lineEnd+1 < text.length ){ 240 lineEnd = text.indexOfAlt("\n", lineStart); 241 if( lineEnd == -1 ) lineEnd = text.length - 1; 242 line = text[lineStart..lineEnd + 1]; 243 lineStart = lineEnd + 1; 244 245 if (auto pv = line in linehash) { 246 chars ~= cast(dchar)*pv; 247 } else { 248 lines ~= line; 249 linehash[line] = lines.length - 1; 250 chars ~= cast(dchar)(lines.length -1); 251 } 252 } 253 return chars.data(); 254 } 255 256 void charsToLines(Diff[] diffs, string[] lineArray) 257 { 258 foreach (ref d; diffs) { 259 auto str = appender!string(); 260 foreach (dchar ch; d.text) 261 str.put(lineArray[ch]); 262 d.text = str.data(); 263 } 264 } 265 266 size_t commonPrefix(string text1, string text2) 267 { 268 auto n = min(text1.length, text2.length); 269 foreach (i; 0 .. n) 270 if (text1[i] != text2[i]) 271 return i; 272 return n; 273 } 274 275 size_t commonSuffix(string text1, string text2) 276 { 277 auto n = min(text1.length, text2.length); 278 foreach (i; 1 .. n+1) 279 if (text1[$-i] != text2[$-i]) 280 return i-1; 281 return n; 282 } 283 284 /** 285 * Determine if the suffix of one string is the prefix of another. 286 * @param text1 First string. 287 * @param text2 Second string. 288 * @return The number of characters common to the end of the first 289 * string and the start of the second string. 290 */ 291 size_t commonOverlap(string text1, string text2) { 292 // Cache the text lengths to prevent multiple calls. 293 auto text1_length = text1.length; 294 auto text2_length = text2.length; 295 // Eliminate the null case. 296 if (text1_length == 0 || text2_length == 0) return 0; 297 298 // Truncate the longer string. 299 if (text1_length > text2_length) { 300 text1 = text1[$ - text2_length .. $]; 301 } else if (text1_length < text2_length) { 302 text2 = text2[0 .. text1_length]; 303 } 304 auto text_length = min(text1_length, text2_length); 305 // Quick check for the worst case. 306 if (text1 == text2) { 307 return text_length; 308 } 309 310 // Start by looking for a single character match 311 // and increase length until no match is found. 312 // Performance analysis: http://neil.fraser.name/news/2010/11/04/ 313 int best = 0; 314 int length = 1; 315 while (true) { 316 string pattern = text1[text_length - length .. $]; 317 auto found = text2.indexOf(pattern); 318 if (found == -1) { 319 return best; 320 } 321 length += found; 322 if (found == 0 || text1[text_length - length .. $] == text2[0 .. length]) { 323 best = length; 324 length++; 325 } 326 } 327 } 328 329 /**- 330 * The data structure representing a diff is a List of Diff objects: 331 * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"), 332 * Diff(Operation.EQUAL, " world.")} 333 * which means: delete "Hello", add "Goodbye" and keep " world." 334 */ 335 enum Operation { 336 DELETE, 337 INSERT, 338 EQUAL 339 } 340 341 342 /** 343 * Struct representing one diff operation. 344 */ 345 struct Diff { 346 Operation operation; 347 string text; 348 349 this(Operation operation, string text) 350 { 351 this.operation = operation; 352 this.text = text; 353 } 354 355 string toString() 356 { 357 //string prettyText = text.replace('\n', '\u00b6'); 358 string op; 359 final switch(operation) { 360 case Operation.DELETE: 361 op = "DELETE"; break; 362 case Operation.INSERT: 363 op = "INSERT"; break; 364 case Operation.EQUAL: 365 op = "EQUAL"; break; 366 } 367 return "Diff(" ~ op ~ ",\"" ~ text ~ "\")"; 368 } 369 370 bool opEquals(const Diff other) const 371 { 372 return operation == other.operation && text == other.text; 373 } 374 } 375 376 377 /** 378 * Find the differences between two texts. 379 * Run a faster, slightly less optimal diff. 380 * This method allows the 'checklines' of diff_main() to be optional. 381 * Most of the time checklines is wanted, so default to true. 382 * @param text1 Old string to be diffed. 383 * @param text2 New string to be diffed. 384 * @return List of Diff objects. 385 */ 386 Diff[] diff_main(string text1, string text2) 387 { 388 return diff_main(text1, text2, true); 389 } 390 391 /** 392 * Find the differences between two texts. 393 * @param text1 Old string to be diffed. 394 * @param text2 New string to be diffed. 395 * @param checklines Speedup flag. If false, then don't run a 396 * line-level diff first to identify the changed areas. 397 * If true, then run a faster slightly less optimal diff. 398 * @return List of Diff objects. 399 */ 400 Diff[] diff_main(string text1, string text2, bool checklines) 401 { 402 // Set a deadline by which time the diff must be complete. 403 SysTime deadline; 404 if (diffTimeout <= 0.seconds) { 405 deadline = SysTime.max; 406 } else { 407 deadline = Clock.currTime(UTC()) + diffTimeout; 408 } 409 return diff_main(text1, text2, checklines, deadline); 410 } 411 412 /** 413 * Find the differences between two texts. Simplifies the problem by 414 * stripping any common prefix or suffix off the texts before diffing. 415 * @param text1 Old string to be diffed. 416 * @param text2 New string to be diffed. 417 * @param checklines Speedup flag. If false, then don't run a 418 * line-level diff first to identify the changed areas. 419 * If true, then run a faster slightly less optimal diff. 420 * @param deadline Time when the diff should be complete by. Used 421 * internally for recursive calls. Users should set DiffTimeout 422 * instead. 423 * @return List of Diff objects. 424 */ 425 Diff[] diff_main(string text1, string text2, bool checklines, SysTime deadline) 426 { 427 Diff[] diffs; 428 if( text1 == text2 ){ 429 if( text1.length != 0 ) diffs ~= Diff(Operation.EQUAL, text1); 430 return diffs; 431 } 432 433 auto pos = commonPrefix(text1, text2); 434 auto prefix = text1[0 .. pos]; 435 text1 = text1[pos .. $]; 436 text2 = text2[pos .. $]; 437 438 pos = commonSuffix(text1, text2); 439 auto suffix = text1[$ - pos .. $]; 440 text1 = text1[0 .. $ - pos]; 441 text2 = text2[0 .. $ - pos]; 442 443 // Compute the diff on the middle block. 444 diffs = computeDiffs(text1, text2, checklines, deadline); 445 446 // Restore the prefix and suffix. 447 if( prefix.length != 0 ) { 448 diffs.insert(0, [Diff(Operation.EQUAL, prefix)]); 449 } 450 if( suffix.length != 0 ) { 451 diffs ~= Diff(Operation.EQUAL, suffix); 452 } 453 454 cleanupMerge(diffs); 455 return diffs; 456 } 457 458 459 460 struct HalfMatch { 461 string prefix1; 462 string suffix1; 463 string suffix2; 464 string prefix2; 465 string commonMiddle; 466 467 bool opEquals()(auto ref const HalfMatch other) const { 468 return prefix1 == other.prefix1 && 469 suffix1 == other.suffix1 && 470 prefix2 == other.prefix2 && 471 suffix2 == other.suffix2; 472 } 473 } 474 /* 475 * Do the two texts share a Substring which is at least half the length of 476 * the longer text? 477 * This speedup can produce non-minimal diffs. 478 * @param text1 First string. 479 * @param text2 Second string. 480 * @return Five element String array, containing the prefix of text1, the 481 * suffix of text1, the prefix of text2, the suffix of text2 and the 482 * common middle. Or null if there was no match. 483 */ 484 bool halfMatch(string text1, string text2, out HalfMatch halfmatch){ 485 if (diffTimeout <= 0.seconds) { 486 // Don't risk returning a non-optimal diff if we have unlimited time. 487 return false; 488 } 489 string longtext = text1.length > text2.length ? text1 : text2; 490 string shorttext = text1.length > text2.length ? text2 : text1; 491 if( longtext.length < 4 || shorttext.length * 2 < longtext.length ) return false; //pointless 492 HalfMatch hm1; 493 HalfMatch hm2; 494 auto is_hm1 = halfMatchI(longtext, shorttext, (longtext.length + 3) / 4, hm1); 495 auto is_hm2 = halfMatchI(longtext, shorttext, (longtext.length + 1) / 2, hm2); 496 HalfMatch hm; 497 if( !is_hm1 && !is_hm2 ){ 498 return false; 499 } else if( !is_hm2 ){ 500 hm = hm1; 501 } else if( !is_hm1 ){ 502 hm = hm2; 503 } else { 504 hm = hm1.commonMiddle.length > hm2.commonMiddle.length ? hm1 : hm2; 505 } 506 507 if( text1.length > text2.length ) { 508 halfmatch = hm; 509 return true; 510 } 511 halfmatch.prefix1 = hm.prefix2; 512 halfmatch.suffix1 = hm.suffix2; 513 halfmatch.prefix2 = hm.prefix1; 514 halfmatch.suffix2 = hm.suffix1; 515 halfmatch.commonMiddle = hm.commonMiddle; 516 return true; 517 } 518 519 520 bool halfMatchI(string longtext, string shorttext, sizediff_t i, out HalfMatch hm){ 521 auto seed = longtext.substr(i, longtext.length / 4); 522 sizediff_t j = -1; 523 string best_common; 524 string best_longtext_a; 525 string best_longtext_b; 526 string best_shorttext_a; 527 string best_shorttext_b; 528 while( j < cast(sizediff_t)shorttext.length && ( j = shorttext.indexOfAlt(seed, j + 1)) != -1 ){ 529 auto prefixLen = commonPrefix(longtext[i .. $], shorttext[j .. $]); 530 auto suffixLen = commonSuffix(longtext[0 .. i], shorttext[0 .. j]); 531 if( best_common.length < suffixLen + prefixLen ) { 532 best_common = shorttext.substr(j - suffixLen, suffixLen) ~ shorttext.substr(j, prefixLen); 533 best_longtext_a = longtext[0 .. i - suffixLen]; 534 best_longtext_b = longtext[i + prefixLen .. $]; 535 best_shorttext_a = shorttext[0 .. j - suffixLen]; 536 best_shorttext_b = shorttext[j + prefixLen .. $]; 537 } 538 } 539 if( best_common.length * 2 >= longtext.length ) { 540 hm.prefix1 = best_longtext_a; 541 hm.suffix1 = best_longtext_b; 542 hm.prefix2 = best_shorttext_a; 543 hm.suffix2 = best_shorttext_b; 544 hm.commonMiddle = best_common; 545 return true; 546 } else { 547 return false; 548 } 549 } 550 551 552 /** 553 * Find the differences between two texts. Assumes that the texts do not 554 * have any common prefix or suffix. 555 * @param text1 Old string to be diffed. 556 * @param text2 New string to be diffed. 557 * @param checklines Speedup flag. If false, then don't run a 558 * line-level diff first to identify the changed areas. 559 * If true, then run a faster slightly less optimal diff. 560 * @param deadline Time when the diff should be complete by. 561 * @return List of Diff objects. 562 */ 563 Diff[] computeDiffs(string text1, string text2, bool checklines, SysTime deadline) 564 { 565 Diff[] diffs; 566 567 if( text1.length == 0 ){ 568 diffs ~= Diff(Operation.INSERT, text2); 569 return diffs; 570 } 571 if( text2.length == 0 ){ 572 diffs ~= Diff(Operation.DELETE, text1); 573 return diffs; 574 } 575 576 auto longtext = text1.length > text2.length ? text1 : text2; 577 auto shorttext = text1.length > text2.length ? text2 : text1; 578 auto i = longtext.indexOf(shorttext); 579 if( i != -1 ){ 580 Operation op = (text1.length > text2.length) ? Operation.DELETE : Operation.INSERT; 581 diffs ~= Diff(op, longtext[0 .. i]); 582 diffs ~= Diff(Operation.EQUAL, shorttext); 583 diffs ~= Diff(op, longtext[i + shorttext.length .. $]); 584 return diffs; 585 } 586 587 if( shorttext.length == 1 ){ 588 diffs ~= Diff(Operation.DELETE, text1); 589 diffs ~= Diff(Operation.INSERT, text2); 590 return diffs; 591 } 592 HalfMatch hm; 593 auto is_hm = halfMatch(text1, text2, hm); 594 if( is_hm ){ 595 auto diffs_a = diff_main(hm.prefix1, hm.prefix2, checklines, deadline); 596 auto diffs_b = diff_main(hm.suffix1, hm.suffix2, checklines, deadline); 597 598 diffs = diffs_a; 599 diffs ~= Diff(Operation.EQUAL, hm.commonMiddle); 600 diffs ~= diffs_b; 601 return diffs; 602 } 603 604 if( checklines && text1.length > 100 && text2.length > 100 ){ 605 return diff_lineMode(text1, text2, deadline); 606 } 607 608 return bisect(text1, text2, deadline); 609 } 610 611 Diff[] diff_lineMode(string text1, string text2, SysTime deadline) 612 { 613 auto b = linesToChars(text1, text2); 614 615 auto diffs = diff_main(b.text1, b.text2, false, deadline); 616 617 charsToLines(diffs, b.uniqueStrings); 618 cleanupSemantic(diffs); 619 620 diffs ~= Diff(Operation.EQUAL, ""); 621 auto pointer = 0; 622 auto count_delete = 0; 623 auto count_insert = 0; 624 string text_delete; 625 string text_insert; 626 while( pointer < diffs.length ){ 627 final switch( diffs[pointer].operation ) { 628 case Operation.INSERT: 629 count_insert++; 630 text_insert ~= diffs[pointer].text; 631 break; 632 case Operation.DELETE: 633 count_delete++; 634 text_delete ~= diffs[pointer].text; 635 break; 636 case Operation.EQUAL: 637 if( count_delete >= 1 && count_insert >= 1 ){ 638 639 diffs.remove(pointer - count_delete - count_insert, 640 count_delete + count_insert); 641 642 pointer = pointer - count_delete - count_insert; 643 644 auto a = diff_main(text_delete, text_insert, false, deadline); 645 diffs.insert(pointer, a); 646 pointer += a.length; 647 } 648 count_insert = 0; 649 count_delete = 0; 650 text_delete = ""; 651 text_insert = ""; 652 break; 653 } 654 pointer++; 655 } 656 diffs.remove(diffs.length - 1); 657 return diffs; 658 } 659 660 Diff[] bisect(string text1, string text2, SysTime deadline) 661 { 662 auto text1_len = text1.length; 663 auto text2_len = text2.length; 664 auto max_d = (text1_len + text2_len + 1) / 2; 665 auto v_offset = max_d; 666 auto v_len = 2 * max_d; 667 sizediff_t[] v1; 668 sizediff_t[] v2; 669 for( auto x = 0; x < v_len; x++ ){ 670 v1 ~= -1; 671 v2 ~= -1; 672 } 673 v1[v_offset + 1] = 0; 674 v2[v_offset + 1] = 0; 675 auto delta = text1_len - text2_len; 676 bool front = (delta % 2 != 0); 677 auto k1start = 0; 678 auto k1end = 0; 679 auto k2start = 0; 680 auto k2end = 0; 681 for( auto d = 0; d < max_d; d++ ){ 682 // Bail out if deadline is reached. 683 if (Clock.currTime(UTC()) > deadline) { 684 break; 685 } 686 687 for( auto k1 = -d + k1start; k1 <= d - k1end; k1 += 2 ){ 688 auto k1_offset = v_offset + k1; 689 sizediff_t x1; 690 if( k1 == -d || k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1] ) { 691 x1 = v1[k1_offset + 1]; 692 } else { 693 x1 = v1[k1_offset - 1] + 1; 694 } 695 auto y1 = x1 - k1; 696 while( x1 < text1_len && y1 < text2_len && text1[x1] == text2[y1] ){ 697 x1++; 698 y1++; 699 } 700 v1[k1_offset] = x1; 701 if( x1 > text1_len) { 702 k1end += 2; 703 } else if( y1 > text2_len ){ 704 k1start += 2; 705 } else if( front ){ 706 auto k2_offset = v_offset + delta - k1; 707 if( k2_offset >= 0 && k2_offset < v_len && v2[k2_offset] != -1) { 708 auto x2 = text1_len - v2[k2_offset]; 709 if( x1 >= x2 ) return bisectSplit(text1, text2, x1, y1, deadline); 710 } 711 } 712 } 713 for( auto k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { 714 auto k2_offset = v_offset + k2; 715 sizediff_t x2; 716 if (k2 == -d || k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) { 717 x2 = v2[k2_offset + 1]; 718 } else { 719 x2 = v2[k2_offset - 1] + 1; 720 } 721 auto y2 = x2 - k2; 722 while( x2 < text1_len && y2 < text2_len 723 && text1[text1_len - x2 - 1] 724 == text2[text2_len - y2 - 1] ){ 725 x2++; 726 y2++; 727 } 728 v2[k2_offset] = x2; 729 if (x2 > text1_len) { 730 // Ran off the left of the graph. 731 k2end += 2; 732 } else if (y2 > text2_len) { 733 // Ran off the top of the graph. 734 k2start += 2; 735 } else if (!front) { 736 auto k1_offset = v_offset + delta - k2; 737 if (k1_offset >= 0 && k1_offset < v_len && v1[k1_offset] != -1) { 738 auto x1 = v1[k1_offset]; 739 auto y1 = v_offset + x1 - k1_offset; 740 // Mirror x2 onto top-left coordinate system. 741 x2 = text1_len - v2[k2_offset]; 742 if (x1 >= x2) { 743 // Overlap detected. 744 return bisectSplit(text1, text2, x1, y1, deadline); 745 } 746 } 747 } 748 } 749 } 750 Diff[] diffs; 751 diffs ~= Diff(Operation.DELETE, text1); 752 diffs ~= Diff(Operation.INSERT, text2); 753 return diffs; 754 } 755 756 757 Diff[] bisectSplit(string text1, string text2, sizediff_t x, sizediff_t y, SysTime deadline) 758 { 759 auto text1a = text1[0 .. x]; 760 auto text2a = text2[0 .. y]; 761 auto text1b = text1[x .. $]; 762 auto text2b = text2[y .. $]; 763 764 Diff[] diffs = diff_main(text1a, text2a, false, deadline); 765 Diff[] diffsb = diff_main(text1b, text2b, false, deadline); 766 diffs ~= diffsb; 767 return diffs; 768 } 769 770 void cleanupSemantic(ref Diff[] diffs) 771 { 772 bool changes = false; 773 size_t[] equalities; 774 775 string last_equality = null; 776 size_t pointer = 0; 777 size_t length_insertions1 = 0; 778 size_t length_deletions1 = 0; 779 size_t length_insertions2 = 0; 780 size_t length_deletions2 = 0; 781 782 while( pointer < diffs.length) { 783 if( diffs[pointer].operation == Operation.EQUAL ){ 784 equalities ~= pointer; 785 length_insertions1 = length_insertions2; 786 length_deletions1 = length_deletions2; 787 length_insertions2 = 0; 788 length_deletions2 = 0; 789 last_equality = diffs[pointer].text; 790 } else { 791 if( diffs[pointer].operation == Operation.INSERT ){ 792 length_insertions2 += diffs[pointer].text.length; 793 } else { 794 length_deletions2 += diffs[pointer].text.length; 795 } 796 797 if( last_equality !is null && 798 (last_equality.length <= max(length_insertions1, length_deletions1)) 799 && (last_equality.length <= max(length_insertions2, length_deletions2))) 800 { 801 // Duplicate record. 802 diffs.insert(equalities[$-1], [Diff(Operation.DELETE, last_equality)]); 803 diffs[equalities[$-1]+1] = Diff(Operation.INSERT, diffs[equalities[$-1]+1].text); 804 805 // Throw away the equality we just deleted. 806 equalities.length--; 807 if (equalities.length > 0) { 808 // Throw away the previous equality (it needs to be reevaluated). 809 equalities.length--; 810 } 811 equalities.assumeSafeAppend(); 812 813 pointer = equalities.length > 0 ? equalities[$-1] : -1; 814 length_insertions1 = 0; 815 length_deletions1 = 0; 816 length_insertions2 = 0; 817 length_deletions2 = 0; 818 last_equality = null; 819 changes = true; 820 } 821 } 822 pointer++; 823 } 824 825 if( changes ) { 826 cleanupMerge(diffs); 827 } 828 cleanupSemanticLossless(diffs); 829 830 // Find any overlaps between deletions and insertions. 831 // e.g: <del>abcxxx</del><ins>xxxdef</ins> 832 // -> <del>abc</del>xxx<ins>def</ins> 833 // e.g: <del>xxxabc</del><ins>defxxx</ins> 834 // -> <ins>def</ins>xxx<del>abc</del> 835 // Only extract an overlap if it is as big as the edit ahead or behind it. 836 837 pointer = 1; 838 while( pointer < diffs.length ){ 839 if( diffs[pointer - 1].operation == Operation.DELETE && 840 diffs[pointer].operation == Operation.INSERT) { 841 auto deletion = diffs[pointer - 1].text; 842 auto insertion = diffs[pointer].text; 843 auto overlap_len1 = commonOverlap(deletion, insertion); 844 auto overlap_len2 = commonOverlap(insertion, deletion); 845 if( overlap_len1 >= overlap_len2 ){ 846 if( overlap_len1 * 2 >= deletion.length || 847 overlap_len1 * 2 >= insertion.length) { 848 //Overlap found. 849 //Insert an equality and trim the surrounding edits. 850 diffs.insert(pointer, [Diff(Operation.EQUAL, insertion[0 .. overlap_len1])]); 851 diffs[pointer - 1].text = deletion[0 .. $ - overlap_len1]; 852 diffs[pointer + 1].text = insertion[overlap_len1 .. $]; 853 pointer++; 854 } 855 } else { 856 if( overlap_len2 * 2 >= deletion.length || 857 overlap_len2 * 2 >= insertion.length) { 858 diffs.insert(pointer, [Diff(Operation.EQUAL, deletion[0 .. overlap_len2])]); 859 860 diffs[pointer - 1].operation = Operation.INSERT; 861 diffs[pointer - 1].text = insertion[0 .. $ - overlap_len2]; 862 diffs[pointer + 1].operation = Operation.DELETE; 863 diffs[pointer + 1].text = deletion[overlap_len2 .. $]; 864 pointer++; 865 } 866 } 867 pointer++; 868 } 869 pointer++; 870 } 871 } 872 873 /** 874 * Look for single edits surrounded on both sides by equalities 875 * which can be shifted sideways to align the edit to a word boundary. 876 * e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. 877 * @param diffs List of Diff objects. 878 */ 879 void cleanupSemanticLossless(ref Diff[] diffs) 880 { 881 auto pointer = 1; 882 // Intentionally ignore the first and last element (don't need checking). 883 while( pointer < cast(sizediff_t)(diffs.length) - 1 ){ 884 if( diffs[pointer-1].operation == Operation.EQUAL && 885 diffs[pointer+1].operation == Operation.EQUAL) { 886 // This is a single edit surrounded by equalities. 887 auto equality1 = diffs[pointer-1].text; 888 auto edit = diffs[pointer].text; 889 auto equality2 = diffs[pointer+1].text; 890 891 // First, shift the edit as far left as possible 892 auto commonOffset = commonSuffix(equality1, edit); 893 if( commonOffset > 0 ){ 894 auto commonString = edit[$ - commonOffset .. $]; 895 equality1 = equality1[0 .. $ - commonOffset]; 896 edit = commonString ~ edit[0 .. $ - commonOffset]; 897 equality2 = commonString ~ equality2; 898 } 899 900 // Second, step character by character right, 901 // looking for the best fit. 902 auto best_equality1 = equality1; 903 auto best_edit = edit; 904 auto best_equality2 = equality2; 905 auto best_score = cleanupSemanticScore(equality1, edit) + cleanupSemanticScore(edit, equality2); 906 while( edit.length != 0 && equality2.length != 0 && edit[0] == equality2[0]){ 907 equality1 ~= edit[0]; 908 edit = edit[1 .. $] ~ equality2[0]; 909 equality2 = equality2[1 .. $]; 910 auto score = cleanupSemanticScore(equality1, edit) + cleanupSemanticScore(edit, equality2); 911 // The >= encourages trailing rather than leading whitespace on 912 // edits. 913 if (score >= best_score) { 914 best_score = score; 915 best_equality1 = equality1; 916 best_edit = edit; 917 best_equality2 = equality2; 918 } 919 } 920 921 if( diffs[pointer-1].text != best_equality1 ){ 922 // We have an improvement, save it back to the diff. 923 if( best_equality1.length != 0) { 924 diffs[pointer-1].text = best_equality1; 925 } else { 926 diffs.remove(pointer - 1); 927 pointer--; 928 } 929 diffs[pointer].text = best_edit; 930 if( best_equality2.length != 0 ){ 931 diffs[pointer+1].text = best_equality2; 932 } else { 933 diffs.remove(pointer + 1); 934 pointer--; 935 } 936 } 937 } 938 pointer++; 939 } 940 } 941 942 943 944 945 /** 946 * Reorder and merge like edit sections. Merge equalities. 947 * Any edit section can move as sizediff_t as it doesn't cross an equality. 948 * @param diffs List of Diff objects. 949 */ 950 void cleanupMerge(ref Diff[] diffs) { 951 diffs ~= Diff(Operation.EQUAL, ""); 952 size_t pointer = 0; 953 size_t count_delete = 0; 954 size_t count_insert = 0; 955 string text_delete; 956 string text_insert; 957 while(pointer < diffs.length) { 958 final switch(diffs[pointer].operation){ 959 case Operation.INSERT: 960 count_insert++; 961 text_insert ~= diffs[pointer].text; 962 pointer++; 963 break; 964 case Operation.DELETE: 965 count_delete++; 966 text_delete ~= diffs[pointer].text; 967 pointer++; 968 break; 969 case Operation.EQUAL: 970 // Upon reaching an equality, check for prior redundancies. 971 if (count_delete + count_insert > 1) { 972 if (count_delete != 0 && count_insert != 0) { 973 // Factor out any common prefixies. 974 if (auto commonlength = commonPrefix(text_insert, text_delete)) { 975 if (pointer > count_delete + count_insert && 976 diffs[pointer - count_delete - count_insert - 1].operation 977 == Operation.EQUAL) 978 { 979 diffs[pointer - count_delete - count_insert - 1].text 980 ~= text_insert[0 .. commonlength]; 981 } else { 982 diffs.insert(0, [Diff(Operation.EQUAL, text_insert[0 .. commonlength])]); 983 pointer++; 984 } 985 text_insert = text_insert[commonlength .. $]; 986 text_delete = text_delete[commonlength .. $]; 987 } 988 // Factor out any common suffixies. 989 if (auto commonlength = commonSuffix(text_insert, text_delete)) { 990 diffs[pointer].text = text_insert[$ - commonlength .. $] ~ diffs[pointer].text; 991 text_insert = text_insert[0 .. $ - commonlength]; 992 text_delete = text_delete[0 .. $ - commonlength]; 993 } 994 } 995 // Delete the offending records and add the merged ones. 996 if (count_delete == 0) { 997 diffs.splice(pointer - count_insert, count_delete + count_insert, [Diff(Operation.INSERT, text_insert)]); 998 } else if (count_insert == 0) { 999 1000 diffs.splice(pointer - count_delete, count_delete + count_insert, [Diff(Operation.DELETE, text_delete)]); 1001 } else { 1002 diffs.splice(pointer - count_delete - count_insert, count_delete + count_insert, [Diff(Operation.DELETE, text_delete), Diff(Operation.INSERT, text_insert)]); 1003 } 1004 pointer = pointer - count_delete - count_insert + 1005 (count_delete != 0 ? 1 : 0) + (count_insert != 0 ? 1 : 0) + 1; 1006 } else if( pointer != 0 && diffs[pointer-1].operation == Operation.EQUAL ){ 1007 diffs[pointer - 1].text ~= diffs[pointer].text; 1008 diffs.remove(pointer); 1009 } else { 1010 pointer++; 1011 } 1012 count_insert = 0; 1013 count_delete = 0; 1014 text_delete = ""; 1015 text_insert = ""; 1016 break; 1017 } 1018 } 1019 if( diffs[$-1].text.length == 0){ 1020 diffs.length--; 1021 } 1022 1023 bool changes = false; 1024 pointer = 1; 1025 while( pointer + 1 < diffs.length ) { 1026 if( diffs[pointer - 1].operation == Operation.EQUAL && 1027 diffs[pointer + 1].operation == Operation.EQUAL) 1028 { 1029 if( diffs[pointer].text.endsWith(diffs[pointer - 1].text)) { 1030 diffs[pointer].text = diffs[pointer - 1].text ~ diffs[pointer].text[0 .. $ - diffs[pointer - 1].text.length]; 1031 diffs[pointer + 1].text = diffs[pointer - 1].text ~ diffs[pointer + 1].text; 1032 diffs.splice(pointer - 1, 1); 1033 changes = true; 1034 } else if( diffs[pointer].text.startsWith(diffs[pointer + 1].text)) { 1035 diffs[pointer - 1].text ~= diffs[pointer + 1].text; 1036 diffs[pointer].text = 1037 diffs[pointer].text[diffs[pointer + 1].text.length .. $] 1038 ~ diffs[pointer + 1].text; 1039 diffs.splice(pointer + 1, 1); 1040 changes = true; 1041 } 1042 } 1043 pointer++; 1044 } 1045 if( changes ) cleanupMerge(diffs); 1046 1047 } 1048 1049 1050 1051 /** 1052 * Given two strings, comAdde a score representing whether the internal 1053 * boundary falls on logical boundaries. 1054 * Scores range from 6 (best) to 0 (worst). 1055 * @param one First string. 1056 * @param two Second string. 1057 * @return The score. 1058 */ 1059 int cleanupSemanticScore(string one, string two) 1060 { 1061 if( one.length == 0 || two.length == 0) return 6; //Edges are the best 1062 auto char1 = one[$-1]; 1063 auto char2 = two[0]; 1064 1065 auto nonAlphaNumeric1 = !(isAlpha(char1) || isNumber(char1)); 1066 auto nonAlphaNumeric2 = !(isAlpha(char2) || isNumber(char2)); 1067 auto whitespace1 = nonAlphaNumeric1 && isWhite(char1); 1068 auto whitespace2 = nonAlphaNumeric2 && isWhite(char2); 1069 auto lineBreak1 = whitespace1 && isControl(char1); 1070 auto lineBreak2 = whitespace2 && isControl(char2); 1071 auto blankLine1 = lineBreak1 && match(one, `\n\r?\n\Z`); 1072 auto blankLine2 = lineBreak2 && match(two, `\A\r?\n\r?\n`); 1073 1074 if (blankLine1 || blankLine2) return 5; 1075 else if (lineBreak1 || lineBreak2) return 4; 1076 else if (nonAlphaNumeric1 && !whitespace1 && whitespace2) return 3; 1077 else if (whitespace1 || whitespace2) return 2; 1078 else if (nonAlphaNumeric1 || nonAlphaNumeric2) return 1; 1079 1080 return 0; 1081 } 1082 1083 1084 /** 1085 * Reduce the number of edits by eliminating operationally trivial 1086 * equalities. 1087 * @param diffs List of Diff objects. 1088 */ 1089 void cleanupEfficiency(ref Diff[] diffs) { 1090 bool changes = false; 1091 size_t[] equalities; 1092 string lastequality; 1093 size_t pointer = 0; 1094 auto pre_ins = false; 1095 auto pre_del = false; 1096 auto post_ins = false; 1097 auto post_del = false; 1098 while( pointer < diffs.length ){ 1099 if( diffs[pointer].operation == Operation.EQUAL ){ 1100 if( diffs[pointer].text.length < DIFF_EDIT_COST && (post_ins || post_del)) { 1101 equalities ~= pointer; 1102 pre_ins = post_ins; 1103 pre_del = post_del; 1104 lastequality = diffs[pointer].text; 1105 } else { 1106 equalities.length = 0; 1107 equalities.assumeSafeAppend; 1108 lastequality = ""; 1109 } 1110 post_ins = false; 1111 post_del = false; 1112 } else { 1113 if( diffs[pointer].operation == Operation.DELETE ){ 1114 post_del = true; 1115 } else { 1116 post_ins = true; 1117 } 1118 1119 if( lastequality.length != 0 1120 && ( 1121 (pre_ins && pre_del && post_ins && post_del) 1122 || ((lastequality.length < DIFF_EDIT_COST / 2) 1123 && ((pre_ins ? 1 : 0) + (pre_del ? 1 : 0) + (post_ins ? 1 : 0) + (post_del ? 1 : 0)) == 3) 1124 ) 1125 ) 1126 { 1127 diffs.insert(equalities[$-1], [Diff(Operation.DELETE, lastequality)]); 1128 diffs[equalities[$-1] + 1].operation = Operation.INSERT; 1129 equalities.length--; 1130 equalities.assumeSafeAppend; 1131 lastequality = ""; 1132 if( pre_ins && pre_del ){ 1133 post_ins = true; 1134 post_del = true; 1135 equalities.length = 0; 1136 equalities.assumeSafeAppend; 1137 } else { 1138 if( equalities.length > 0 ) { 1139 equalities.length--; 1140 equalities.assumeSafeAppend; 1141 } 1142 1143 pointer = equalities.length > 0 ? equalities[$-1] : -1; 1144 post_ins = false; 1145 post_del = false; 1146 } 1147 changes = true; 1148 } 1149 } 1150 pointer++; 1151 } 1152 1153 if( changes ){ 1154 cleanupMerge(diffs); 1155 } 1156 } 1157 1158 /** 1159 * loc is a location in text1, comAdde and return the equivalent location in 1160 * text2. 1161 * e.g. "The cat" vs "The big cat", 1->1, 5->8 1162 * @param diffs List of Diff objects. 1163 * @param loc Location within text1. 1164 * @return Location within text2. 1165 */ 1166 sizediff_t xIndex(Diff[] diffs, sizediff_t loc){ 1167 auto chars1 = 0; 1168 auto chars2 = 0; 1169 auto last_chars1 = 0; 1170 auto last_chars2 = 0; 1171 Diff lastDiff; 1172 foreach ( diff; diffs) { 1173 if (diff.operation != Operation.INSERT) { 1174 // Equality or deletion. 1175 chars1 += diff.text.length; 1176 } 1177 if (diff.operation != Operation.DELETE) { 1178 // Equality or insertion. 1179 chars2 += diff.text.length; 1180 } 1181 if (chars1 > loc) { 1182 // Overshot the location. 1183 lastDiff = diff; 1184 break; 1185 } 1186 last_chars1 = chars1; 1187 last_chars2 = chars2; 1188 } 1189 if (lastDiff.operation == Operation.DELETE) { 1190 // The location was deleted. 1191 return last_chars2; 1192 } 1193 // Add the remaining character length. 1194 return last_chars2 + (loc - last_chars1); 1195 } 1196 1197 /** 1198 * Unescape selected chars for compatability with JavaScript's encodeURI. 1199 * In speed critical applications this could be dropped since the 1200 * receiving application will certainly decode these fine. 1201 * Note that this function is case-sensitive. Thus "%3F" would not be 1202 * unescaped. But this is ok because it is only called with the output of 1203 * HttpUtility.UrlEncode which returns lowercase hex. 1204 * 1205 * Example: "%3f" -> "?", "%24" -> "$", etc. 1206 * 1207 * @param str The string to escape. 1208 * @return The escaped string. 1209 */ 1210 public static string unescapeForEncodeUriCompatibility(string str) 1211 { 1212 // FIXME: this is ridiculously inefficient 1213 return str.replace("%21", "!").replace("%7e", "~") 1214 .replace("%27", "'").replace("%28", "(").replace("%29", ")") 1215 .replace("%3b", ";").replace("%2f", "/").replace("%3f", "?") 1216 .replace("%3a", ":").replace("%40", "@").replace("%26", "&") 1217 .replace("%3d", "=").replace("%2b", "+").replace("%24", "$") 1218 .replace("%2c", ",").replace("%23", "#"); 1219 }