1 /* 2 * Copyright 2008 Google Inc. All Rights Reserved. 3 * Copyright 2013-2014 Jan Krüger. All Rights Reserved. 4 * Author: fraser@google.com (Neil Fraser) 5 * Author: anteru@developer.shelter13.net (Matthaeus G. Chajdas) 6 * Author: jan@jandoe.de (Jan Krüger) 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 * 20 * Diff Match and Patch 21 * http://code.google.com/p/google-diff-match-patch/ 22 */ 23 module ddmp.diff; 24 25 import ddmp.util; 26 27 import std.array; 28 import std.conv; 29 import std.datetime : SysTime, Clock, UTC; 30 import std.exception : enforce; 31 import std..string : indexOf, endsWith, startsWith; 32 import std.uni; 33 import std.regex; 34 import std.algorithm : min, max; 35 import std.digest.sha; 36 import core.time; 37 38 39 Duration diffTimeout = 1.seconds; 40 int DIFF_EDIT_COST = 4; 41 42 43 /** 44 * Compute and return the source text (all equalities and deletions). 45 * @param diffs List of Diff objects. 46 * @return Source text. 47 */ 48 string diff_text1(Diff[] diffs) { 49 auto text = appender!string(); 50 foreach ( d; diffs ) { 51 if (d.operation != Operation.INSERT) { 52 text.put(d.text); 53 } 54 } 55 return text.data(); 56 } 57 58 /** 59 * Compute and return the destination text (all equalities and insertions). 60 * @param diffs List of Diff objects. 61 * @return Destination text. 62 */ 63 string diff_text2(Diff[] diffs) { 64 auto text = appender!string(); 65 foreach ( d; diffs ) { 66 if (d.operation != Operation.DELETE) { 67 text.put(d.text); 68 } 69 } 70 return text.data(); 71 } 72 73 74 /** 75 * Compute the Levenshtein distance; the number of inserted, deleted or 76 * substituted characters. 77 * @param diffs List of Diff objects. 78 * @return Number of changes. 79 */ 80 int levenshtein(Diff[] diffs) { 81 int levenshtein = 0; 82 int insertions = 0; 83 int deletions = 0; 84 foreach ( d ; diffs ) { 85 final switch (d.operation) { 86 case Operation.INSERT: 87 insertions += d.text.length; 88 break; 89 case Operation.DELETE: 90 deletions += d.text.length; 91 break; 92 case Operation.EQUAL: 93 // A deletion and an insertion is one substitution. 94 levenshtein += max(insertions, deletions); 95 insertions = 0; 96 deletions = 0; 97 break; 98 } 99 } 100 levenshtein += max(insertions, deletions); 101 return levenshtein; 102 } 103 104 105 /** 106 * Crush the diff into an encoded string which describes the operations 107 * required to transform text1 into text2. 108 * E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. 109 * Operations are tab-separated. Inserted text is escaped using %xx 110 * notation. 111 * @param diffs Array of Diff objects. 112 * @return Delta text. 113 */ 114 string toDelta(in Diff[] diffs) 115 { 116 import std.format : formattedWrite; 117 import std.uri : encode; 118 auto text = appender!string; 119 foreach (aDiff; diffs) { 120 final switch (aDiff.operation) { 121 case Operation.INSERT: 122 text.formattedWrite("+%s\t", encode(aDiff.text).replace("+", " ")); 123 break; 124 case Operation.DELETE: 125 text.formattedWrite("-%s\t", aDiff.text.length); 126 break; 127 case Operation.EQUAL: 128 text.formattedWrite("=%s\t", aDiff.text.length); 129 break; 130 } 131 } 132 string delta = text.data; 133 if (delta.length != 0) { 134 // Strip off trailing tab character. 135 delta = delta[0 .. $-1]; 136 delta = unescapeForEncodeUriCompatability(delta); 137 } 138 return delta; 139 } 140 141 /** 142 * Given the original text1, and an encoded string which describes the 143 * operations required to transform text1 into text2, comAdde the full diff. 144 * @param text1 Source string for the diff. 145 * @param delta Delta text. 146 * @return Array of Diff objects or null if invalid. 147 * @throws ArgumentException If invalid input. 148 */ 149 Diff[] fromDelta(string text1, string delta) 150 { 151 import std..string : format; 152 import std.uri : decode; 153 154 auto diffs = appender!(Diff[]); 155 int pointer = 0; // Cursor in text1 156 foreach (token; delta.splitter("\t")) { 157 if (token.length == 0) { 158 // Blank tokens are ok (from a trailing \t). 159 continue; 160 } 161 // Each token begins with a one character parameter which specifies the 162 // operation of this token (delete, insert, equality). 163 string param = token[1 .. $]; 164 switch (token[0]) { 165 case '+': 166 // decode would change all "+" to " " 167 param = param.replace("+", "%2b"); 168 param = decode(param); 169 //} catch (UnsupportedEncodingException e) { 170 // // Not likely on modern system. 171 // throw new Error("This system does not support UTF-8.", e); 172 //} catch (IllegalArgumentException e) { 173 // // Malformed URI sequence. 174 // throw new IllegalArgumentException( 175 // "Illegal escape in diff_fromDelta: " + param, e); 176 //} 177 diffs ~= Diff(Operation.INSERT, param); 178 break; 179 case '-': // Fall through. 180 case '=': 181 int n; 182 try { 183 n = param.to!int; 184 } catch (ConvException e) { 185 throw new Exception("Invalid number in diff_fromDelta: " ~ param); 186 } 187 enforce (n >= 0, "Negative number in diff_fromDelta: " ~ param); 188 189 string text; 190 enforce (pointer + n <= text1.length, 191 format("Delta length (%s) larger than source text length (%s).", 192 pointer, text1.length)); 193 text = text1[pointer .. pointer+n]; 194 pointer += n; 195 if (token[0] == '=') { 196 diffs ~= Diff(Operation.EQUAL, text); 197 } else { 198 diffs ~= Diff(Operation.DELETE, text); 199 } 200 break; 201 default: 202 // Anything else is an error. 203 throw new Exception( 204 "Invalid diff operation in diff_fromDelta: " ~ token[0]); 205 } 206 } 207 if (pointer != text1.length) 208 throw new Exception(format("Delta length (´%s) smaller than source text length (%s).", pointer, text1.length)); 209 return diffs.data; 210 } 211 212 struct LinesToCharsResult { 213 string text1; 214 string text2; 215 string[] uniqueStrings; 216 bool opEquals()(auto ref const LinesToCharsResult other) const { 217 return text1 == other.text1 && 218 text2 == other.text2 && 219 uniqueStrings == other.uniqueStrings; 220 } 221 } 222 223 LinesToCharsResult linesToChars(string text1, string text2) 224 { 225 LinesToCharsResult res; 226 int[string] lineHash; 227 res.uniqueStrings ~= ""; 228 res.text1 = linesToCharsMunge(text1, res.uniqueStrings, lineHash); 229 res.text2 = linesToCharsMunge(text2, res.uniqueStrings, lineHash); 230 return res; 231 } 232 233 string linesToCharsMunge(string text, ref string[] lines, ref int[string] linehash) 234 { 235 sizediff_t lineStart = 0; 236 sizediff_t lineEnd = -1; 237 string line; 238 auto chars = appender!string(); 239 while( lineEnd < cast(sizediff_t)text.length - 1 ){ 240 lineEnd = text.indexOfAlt("\n", lineStart); 241 if( lineEnd == -1 ) lineEnd = text.length - 1; 242 line = text[lineStart..lineEnd + 1]; 243 lineStart = lineEnd + 1; 244 245 auto pv = line in linehash; 246 if( pv ) { 247 chars ~= cast(char)*pv; 248 } else { 249 lines ~= line; 250 linehash[line] = lines.length - 1; 251 chars ~= cast(char)(lines.length -1); 252 } 253 } 254 return chars.data(); 255 } 256 257 void charsToLines(ref Diff[] diffs, string[] lineArray) 258 { 259 foreach( d ; diffs){ 260 auto str = appender!string(); 261 for( auto y = 0; y < d.text.length; y++) { 262 str.put(lineArray[d.text[y]]); 263 } 264 d.text = str.data(); 265 } 266 } 267 268 int commonPrefix(string text1, string text2) 269 { 270 auto n = min(text1.length, text2.length); 271 for( auto i = 0; i < n; ++i ){ 272 if( text1[i] != text2[i] ) return i; 273 } 274 return n; 275 } 276 277 int commonSuffix(string text1, string text2) 278 { 279 auto len1 = text1.length; 280 auto len2 = text2.length; 281 auto n = min(len1, len2); 282 283 for( auto i = 1; i <= n; i++ ){ 284 if( text1[len1 - i] != text2[len2 - i] ) return i - 1; 285 } 286 return n; 287 } 288 289 /** 290 * Determine if the suffix of one string is the prefix of another. 291 * @param text1 First string. 292 * @param text2 Second string. 293 * @return The number of characters common to the end of the first 294 * string and the start of the second string. 295 */ 296 int commonOverlap(string text1, string text2) { 297 // Cache the text lengths to prevent multiple calls. 298 int text1_length = text1.length; 299 int text2_length = text2.length; 300 // Eliminate the null case. 301 if (text1_length == 0 || text2_length == 0) return 0; 302 303 // Truncate the longer string. 304 if (text1_length > text2_length) { 305 text1 = text1.substr(text1_length - text2_length); 306 } else if (text1_length < text2_length) { 307 text2 = text2.substr(0, text1_length); 308 } 309 int text_length = min(text1_length, text2_length); 310 // Quick check for the worst case. 311 if (text1 == text2) { 312 return text_length; 313 } 314 315 // Start by looking for a single character match 316 // and increase length until no match is found. 317 // Performance analysis: http://neil.fraser.name/news/2010/11/04/ 318 int best = 0; 319 int length = 1; 320 while (true) { 321 string pattern = text1.substr(text_length - length); 322 int found = text2.indexOf(pattern); 323 if (found == -1) { 324 return best; 325 } 326 length += found; 327 if (found == 0 || text1.substr(text_length - length) == text2.substr(0, length)) { 328 best = length; 329 length++; 330 } 331 } 332 } 333 334 /**- 335 * The data structure representing a diff is a List of Diff objects: 336 * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"), 337 * Diff(Operation.EQUAL, " world.")} 338 * which means: delete "Hello", add "Goodbye" and keep " world." 339 */ 340 enum Operation { 341 DELETE, 342 INSERT, 343 EQUAL 344 } 345 346 347 /** 348 * Struct representing one diff operation. 349 */ 350 struct Diff { 351 Operation operation; 352 string text; 353 354 this(Operation operation, string text) 355 { 356 this.operation = operation; 357 this.text = text; 358 } 359 360 string toString() 361 { 362 //string prettyText = text.replace('\n', '\u00b6'); 363 string op; 364 final switch(operation) { 365 case Operation.DELETE: 366 op = "DELETE"; break; 367 case Operation.INSERT: 368 op = "INSERT"; break; 369 case Operation.EQUAL: 370 op = "EQUAL"; break; 371 } 372 return "Diff(" ~ op ~ ",\"" ~ text ~ "\")"; 373 } 374 375 bool opEquals(const Diff other) const 376 { 377 return operation == other.operation && text == other.text; 378 } 379 } 380 381 382 /** 383 * Find the differences between two texts. 384 * Run a faster, slightly less optimal diff. 385 * This method allows the 'checklines' of diff_main() to be optional. 386 * Most of the time checklines is wanted, so default to true. 387 * @param text1 Old string to be diffed. 388 * @param text2 New string to be diffed. 389 * @return List of Diff objects. 390 */ 391 Diff[] diff_main(string text1, string text2) 392 { 393 return diff_main(text1, text2, true); 394 } 395 396 /** 397 * Find the differences between two texts. 398 * @param text1 Old string to be diffed. 399 * @param text2 New string to be diffed. 400 * @param checklines Speedup flag. If false, then don't run a 401 * line-level diff first to identify the changed areas. 402 * If true, then run a faster slightly less optimal diff. 403 * @return List of Diff objects. 404 */ 405 Diff[] diff_main(string text1, string text2, bool checklines) 406 { 407 // Set a deadline by which time the diff must be complete. 408 SysTime deadline; 409 if (diffTimeout <= 0.seconds) { 410 deadline = SysTime.max; 411 } else { 412 deadline = Clock.currTime(UTC()) + diffTimeout; 413 } 414 return diff_main(text1, text2, checklines, deadline); 415 } 416 417 /** 418 * Find the differences between two texts. Simplifies the problem by 419 * stripping any common prefix or suffix off the texts before diffing. 420 * @param text1 Old string to be diffed. 421 * @param text2 New string to be diffed. 422 * @param checklines Speedup flag. If false, then don't run a 423 * line-level diff first to identify the changed areas. 424 * If true, then run a faster slightly less optimal diff. 425 * @param deadline Time when the diff should be complete by. Used 426 * internally for recursive calls. Users should set DiffTimeout 427 * instead. 428 * @return List of Diff objects. 429 */ 430 Diff[] diff_main(string text1, string text2, bool checklines, SysTime deadline) 431 { 432 Diff[] diffs; 433 if( text1 == text2 ){ 434 if( text1.length != 0 ) diffs ~= Diff(Operation.EQUAL, text1); 435 return diffs; 436 } 437 438 auto pos = commonPrefix(text1, text2); 439 auto prefix = text1.substr(0, pos); 440 text1 = text1.substr(pos); 441 text2 = text2.substr(pos); 442 443 pos = commonSuffix(text1, text2); 444 auto suffix = text1.substr(text1.length - pos); 445 text1 = text1.substr(0, text1.length - pos); 446 text2 = text2.substr(0, text2.length - pos); 447 448 // Compute the diff on the middle block. 449 diffs = computeDiffs(text1, text2, checklines, deadline); 450 451 // Restore the prefix and suffix. 452 if( prefix.length != 0 ) { 453 diffs.insert(0, [Diff(Operation.EQUAL, prefix)]); 454 } 455 if( suffix.length != 0 ) { 456 diffs ~= Diff(Operation.EQUAL, suffix); 457 } 458 459 cleanupMerge(diffs); 460 return diffs; 461 } 462 463 464 465 struct HalfMatch { 466 string prefix1; 467 string suffix1; 468 string suffix2; 469 string prefix2; 470 string commonMiddle; 471 472 bool opEquals()(auto ref const HalfMatch other) const { 473 return prefix1 == other.prefix1 && 474 suffix1 == other.suffix1 && 475 prefix2 == other.prefix2 && 476 suffix2 == other.suffix2; 477 } 478 } 479 /* 480 * Do the two texts share a Substring which is at least half the length of 481 * the longer text? 482 * This speedup can produce non-minimal diffs. 483 * @param text1 First string. 484 * @param text2 Second string. 485 * @return Five element String array, containing the prefix of text1, the 486 * suffix of text1, the prefix of text2, the suffix of text2 and the 487 * common middle. Or null if there was no match. 488 */ 489 bool halfMatch(string text1, string text2, out HalfMatch halfmatch){ 490 if (diffTimeout <= 0.seconds) { 491 // Don't risk returning a non-optimal diff if we have unlimited time. 492 return false; 493 } 494 string longtext = text1.length > text2.length ? text1 : text2; 495 string shorttext = text1.length > text2.length ? text2 : text1; 496 if( longtext.length < 4 || shorttext.length * 2 < longtext.length ) return false; //pointless 497 HalfMatch hm1; 498 HalfMatch hm2; 499 auto is_hm1 = halfMatchI(longtext, shorttext, (longtext.length + 3) / 4, hm1); 500 auto is_hm2 = halfMatchI(longtext, shorttext, (longtext.length + 1) / 2, hm2); 501 HalfMatch hm; 502 if( !is_hm1 && !is_hm2 ){ 503 return false; 504 } else if( !is_hm2 ){ 505 hm = hm1; 506 } else if( !is_hm1 ){ 507 hm = hm2; 508 } else { 509 hm = hm1.commonMiddle.length > hm2.commonMiddle.length ? hm1 : hm2; 510 } 511 512 if( text1.length > text2.length ) { 513 halfmatch = hm; 514 return true; 515 } 516 halfmatch.prefix1 = hm.prefix2; 517 halfmatch.suffix1 = hm.suffix2; 518 halfmatch.prefix2 = hm.prefix1; 519 halfmatch.suffix2 = hm.suffix1; 520 halfmatch.commonMiddle = hm.commonMiddle; 521 return true; 522 } 523 524 525 bool halfMatchI(string longtext, string shorttext, int i, out HalfMatch hm){ 526 auto seed = longtext.substr(i, longtext.length / 4); 527 sizediff_t j = -1; 528 string best_common; 529 string best_longtext_a; 530 string best_longtext_b; 531 string best_shorttext_a; 532 string best_shorttext_b; 533 while( j < cast(sizediff_t)shorttext.length && ( j = shorttext.indexOfAlt(seed, j + 1)) != -1 ){ 534 auto prefixLen = commonPrefix(longtext.substr(i), shorttext.substr(j)); 535 auto suffixLen = commonSuffix(longtext.substr(0, i), shorttext.substr(0, j)); 536 if( best_common.length < suffixLen + prefixLen ) { 537 best_common = shorttext.substr(j - suffixLen, suffixLen) ~ shorttext.substr(j, prefixLen); 538 best_longtext_a = longtext.substr(0, i - suffixLen); 539 best_longtext_b = longtext.substr(i + prefixLen); 540 best_shorttext_a = shorttext.substr(0, j - suffixLen); 541 best_shorttext_b = shorttext.substr(j + prefixLen); 542 } 543 } 544 if( best_common.length * 2 >= longtext.length ) { 545 hm.prefix1 = best_longtext_a; 546 hm.suffix1 = best_longtext_b; 547 hm.prefix2 = best_shorttext_a; 548 hm.suffix2 = best_shorttext_b; 549 hm.commonMiddle = best_common; 550 return true; 551 } else { 552 return false; 553 } 554 } 555 556 557 /** 558 * Find the differences between two texts. Assumes that the texts do not 559 * have any common prefix or suffix. 560 * @param text1 Old string to be diffed. 561 * @param text2 New string to be diffed. 562 * @param checklines Speedup flag. If false, then don't run a 563 * line-level diff first to identify the changed areas. 564 * If true, then run a faster slightly less optimal diff. 565 * @param deadline Time when the diff should be complete by. 566 * @return List of Diff objects. 567 */ 568 Diff[] computeDiffs(string text1, string text2, bool checklines, SysTime deadline) 569 { 570 Diff[] diffs; 571 572 if( text1.length == 0 ){ 573 diffs ~= Diff(Operation.INSERT, text2); 574 return diffs; 575 } 576 if( text2.length == 0 ){ 577 diffs ~= Diff(Operation.DELETE, text1); 578 return diffs; 579 } 580 581 auto longtext = text1.length > text2.length ? text1 : text2; 582 auto shorttext = text1.length > text2.length ? text2 : text1; 583 auto i = longtext.indexOf(shorttext); 584 if( i != -1 ){ 585 Operation op = (text1.length > text2.length) ? Operation.DELETE : Operation.INSERT; 586 diffs ~= Diff(op, longtext.substr(0, i)); 587 diffs ~= Diff(Operation.EQUAL, shorttext); 588 diffs ~= Diff(op, longtext.substr(i + shorttext.length)); 589 return diffs; 590 } 591 592 if( shorttext.length == 1 ){ 593 diffs ~= Diff(Operation.DELETE, text1); 594 diffs ~= Diff(Operation.INSERT, text2); 595 return diffs; 596 } 597 HalfMatch hm; 598 auto is_hm = halfMatch(text1, text2, hm); 599 if( is_hm ){ 600 auto diffs_a = diff_main(hm.prefix1, hm.prefix2, checklines, deadline); 601 auto diffs_b = diff_main(hm.suffix1, hm.suffix2, checklines, deadline); 602 603 diffs = diffs_a; 604 diffs ~= Diff(Operation.EQUAL, hm.commonMiddle); 605 diffs ~= diffs_b; 606 return diffs; 607 } 608 609 if( checklines && text1.length > 100 && text2.length > 100 ){ 610 return diff_lineMode(text1, text2, deadline); 611 } 612 613 return bisect(text1, text2, deadline); 614 } 615 616 Diff[] diff_lineMode(string text1, string text2, SysTime deadline) 617 { 618 auto b = linesToChars(text1, text2); 619 620 auto diffs = diff_main(b.text1, b.text2, false, deadline); 621 622 charsToLines(diffs, b.uniqueStrings); 623 cleanupSemantic(diffs); 624 625 diffs ~= Diff(Operation.EQUAL, ""); 626 auto pointer = 0; 627 auto count_delete = 0; 628 auto count_insert = 0; 629 string text_delete; 630 string text_insert; 631 while( pointer < diffs.length ){ 632 final switch( diffs[pointer].operation ) { 633 case Operation.INSERT: 634 count_insert++; 635 text_insert ~= diffs[pointer].text; 636 break; 637 case Operation.DELETE: 638 count_delete++; 639 text_delete ~= diffs[pointer].text; 640 break; 641 case Operation.EQUAL: 642 if( count_delete >= 1 && count_insert >= 1 ){ 643 644 diffs.remove(pointer - count_delete - count_insert, 645 count_delete + count_insert); 646 647 pointer = pointer - count_delete - count_insert; 648 649 auto a = diff_main(text_delete, text_insert, false, deadline); 650 diffs.insert(pointer, a); 651 pointer += a.length; 652 } 653 count_insert = 0; 654 count_delete = 0; 655 text_delete = ""; 656 text_insert = ""; 657 break; 658 } 659 pointer++; 660 } 661 diffs.remove(diffs.length - 1); 662 return diffs; 663 } 664 665 Diff[] bisect(string text1, string text2, SysTime deadline) 666 { 667 auto text1_len = text1.length; 668 auto text2_len = text2.length; 669 int max_d = (text1_len + text2_len + 1) / 2; 670 int v_offset = max_d; 671 int v_len = 2 * max_d; 672 int[] v1; 673 int[] v2; 674 for( auto x = 0; x < v_len; x++ ){ 675 v1 ~= -1; 676 v2 ~= -1; 677 } 678 v1[v_offset + 1] = 0; 679 v2[v_offset + 1] = 0; 680 int delta = text1_len - text2_len; 681 bool front = (delta % 2 != 0); 682 auto k1start = 0; 683 auto k1end = 0; 684 auto k2start = 0; 685 auto k2end = 0; 686 for( auto d = 0; d < max_d; d++ ){ 687 // Bail out if deadline is reached. 688 if (Clock.currTime(UTC()) > deadline) { 689 break; 690 } 691 692 for( int k1 = -d + k1start; k1 <= d - k1end; k1 += 2 ){ 693 auto k1_offset = v_offset + k1; 694 int x1; 695 if( k1 == -d || k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1] ) { 696 x1 = v1[k1_offset + 1]; 697 } else { 698 x1 = v1[k1_offset - 1] + 1; 699 } 700 auto y1 = x1 - k1; 701 while( x1 < text1_len && y1 < text2_len && text1[x1] == text2[y1] ){ 702 x1++; 703 y1++; 704 } 705 v1[k1_offset] = x1; 706 if( x1 > text1_len) { 707 k1end += 2; 708 } else if( y1 > text2_len ){ 709 k1start += 2; 710 } else if( front ){ 711 auto k2_offset = v_offset + delta - k1; 712 if( k2_offset >= 0 && k2_offset < v_len && v2[k2_offset] != -1) { 713 auto x2 = text1_len - v2[k2_offset]; 714 if( x1 >= x2 ) return bisectSplit(text1, text2, x1, y1, deadline); 715 } 716 } 717 } 718 for( auto k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { 719 auto k2_offset = v_offset + k2; 720 int x2; 721 if (k2 == -d || k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1]) { 722 x2 = v2[k2_offset + 1]; 723 } else { 724 x2 = v2[k2_offset - 1] + 1; 725 } 726 auto y2 = x2 - k2; 727 while( x2 < text1_len && y2 < text2_len 728 && text1[text1_len - x2 - 1] 729 == text2[text2_len - y2 - 1] ){ 730 x2++; 731 y2++; 732 } 733 v2[k2_offset] = x2; 734 if (x2 > text1_len) { 735 // Ran off the left of the graph. 736 k2end += 2; 737 } else if (y2 > text2_len) { 738 // Ran off the top of the graph. 739 k2start += 2; 740 } else if (!front) { 741 auto k1_offset = v_offset + delta - k2; 742 if (k1_offset >= 0 && k1_offset < v_len && v1[k1_offset] != -1) { 743 auto x1 = v1[k1_offset]; 744 auto y1 = v_offset + x1 - k1_offset; 745 // Mirror x2 onto top-left coordinate system. 746 x2 = text1_len - v2[k2_offset]; 747 if (x1 >= x2) { 748 // Overlap detected. 749 return bisectSplit(text1, text2, x1, y1, deadline); 750 } 751 } 752 } 753 } 754 } 755 Diff[] diffs; 756 diffs ~= Diff(Operation.DELETE, text1); 757 diffs ~= Diff(Operation.INSERT, text2); 758 return diffs; 759 } 760 761 762 Diff[] bisectSplit(string text1, string text2, int x, int y, SysTime deadline) 763 { 764 auto text1a = text1.substr(0, x); 765 auto text2a = text2.substr(0, y); 766 auto text1b = text1.substr(x); 767 auto text2b = text2.substr(y); 768 769 Diff[] diffs = diff_main(text1a, text2a, false, deadline); 770 Diff[] diffsb = diff_main(text1b, text2b, false, deadline); 771 diffs ~= diffsb; 772 return diffs; 773 } 774 775 void cleanupSemantic(ref Diff[] diffs) 776 { 777 auto changes = false; 778 int[] equalities; 779 780 string last_equality = null; 781 auto pointer = 0; 782 auto length_insertions1 = 0; 783 auto length_deletions1 = 0; 784 auto length_insertions2 = 0; 785 auto length_deletions2 = 0; 786 787 while( pointer < diffs.length) { 788 if( diffs[pointer].operation == Operation.EQUAL ){ 789 equalities ~= pointer; 790 length_insertions1 = length_insertions2; 791 length_deletions1 = length_deletions2; 792 length_insertions2 = 0; 793 length_deletions2 = 0; 794 last_equality = diffs[pointer].text; 795 } else { 796 if( diffs[pointer].operation == Operation.INSERT ){ 797 length_insertions2 += diffs[pointer].text.length; 798 } else { 799 length_deletions2 += diffs[pointer].text.length; 800 } 801 802 if( last_equality != null && 803 (last_equality.length <= max(length_insertions1, length_deletions1)) 804 && (last_equality.length <= max(length_insertions2, length_deletions2))) { 805 // Duplicate record. 806 diffs.insert(equalities[$-1], [Diff(Operation.DELETE, last_equality)]); 807 diffs[equalities[$-1]+1] = Diff(Operation.INSERT, diffs[equalities[$-1]+1].text); 808 809 //Correct to pop twice ?????? 810 equalities = equalities[0..$-1]; 811 if( equalities.length > 0 ){ 812 equalities = equalities[0..$-1]; 813 } 814 pointer = equalities.length > 0 ? equalities[$-1] : -1; 815 length_insertions1 = 0; 816 length_deletions1 = 0; 817 length_insertions2 = 0; 818 length_deletions2 = 0; 819 last_equality = null; 820 changes = true; 821 } 822 } 823 pointer++; 824 } 825 826 if( changes ) { 827 cleanupMerge(diffs); 828 } 829 cleanupSemanticLossless(diffs); 830 pointer = 1; 831 while( pointer < diffs.length ){ 832 if( diffs[pointer - 1].operation == Operation.DELETE && 833 diffs[pointer].operation == Operation.INSERT) { 834 auto deletion = diffs[pointer - 1].text; 835 auto insertion = diffs[pointer].text; 836 auto overlap_len1 = commonOverlap(deletion, insertion); 837 auto overlap_len2 = commonOverlap(insertion, deletion); 838 if( overlap_len1 >= overlap_len2 ){ 839 if( overlap_len1 >= deletion.length / 2.0 || 840 overlap_len1 >= insertion.length / 2.0) { 841 //Overlap found. 842 //Insert an equality and trim the surrounding edits. 843 diffs.insert(pointer, [Diff(Operation.EQUAL, insertion.substr(0, overlap_len1))]); 844 diffs[pointer - 1].text = deletion.substr(0, deletion.length - overlap_len1); 845 diffs[pointer + 1].text = insertion.substr(0, deletion.length - overlap_len1); 846 pointer++; 847 } 848 } else { 849 if( overlap_len2 > deletion.length / 2.0 || 850 overlap_len2 > insertion.length / 2.0) { 851 diffs.insert(pointer, [Diff(Operation.EQUAL, deletion.substr(0, overlap_len2))]); 852 853 diffs[pointer - 1].operation = Operation.INSERT; 854 diffs[pointer - 1].text = insertion.substr(0, insertion.length - overlap_len2); 855 diffs[pointer + 1].operation = Operation.DELETE; 856 diffs[pointer + 1].text = deletion.substr(overlap_len2); 857 pointer++; 858 } 859 } 860 pointer++; 861 } 862 pointer++; 863 } 864 } 865 866 /** 867 * Look for single edits surrounded on both sides by equalities 868 * which can be shifted sideways to align the edit to a word boundary. 869 * e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came. 870 * @param diffs List of Diff objects. 871 */ 872 void cleanupSemanticLossless(ref Diff[] diffs) 873 { 874 auto pointer = 1; 875 // Intentionally ignore the first and last element (don't need checking). 876 while( pointer < cast(sizediff_t)(diffs.length) - 1 ){ 877 if( diffs[pointer-1].operation == Operation.EQUAL && 878 diffs[pointer+1].operation == Operation.EQUAL) { 879 // This is a single edit surrounded by equalities. 880 auto equality1 = diffs[pointer-1].text; 881 auto edit = diffs[pointer].text; 882 auto equality2 = diffs[pointer+1].text; 883 884 // First, shift the edit as far left as possible 885 auto commonOffset = commonSuffix(equality1, edit); 886 if( commonOffset > 0 ){ 887 auto commonString = edit.substr(edit.length - commonOffset); 888 equality1 = equality1.substr(0, equality1.length - commonOffset); 889 edit = commonString ~ edit.substr(0, edit.length - commonOffset); 890 equality2 = commonString ~ equality2; 891 } 892 893 // Second, step character by character right, 894 // looking for the best fit. 895 auto best_equality1 = equality1; 896 auto best_edit = edit; 897 auto best_equality2 = equality2; 898 auto best_score = cleanupSemanticScore(equality1, edit) + cleanupSemanticScore(edit, equality2); 899 while( edit.length != 0 && equality2.length != 0 && edit[0] == equality2[0]){ 900 equality1 ~= edit[0]; 901 edit = edit.substr(1) ~ equality2[0]; 902 equality2 = equality2.substr(1); 903 auto score = cleanupSemanticScore(equality1, edit) + cleanupSemanticScore(edit, equality2); 904 // The >= encourages trailing rather than leading whitespace on 905 // edits. 906 if (score >= best_score) { 907 best_score = score; 908 best_equality1 = equality1; 909 best_edit = edit; 910 best_equality2 = equality2; 911 } 912 } 913 914 if( diffs[pointer-1].text != best_equality1 ){ 915 // We have an improvement, save it back to the diff. 916 if( best_equality1.length != 0) { 917 diffs[pointer-1].text = best_equality1; 918 } else { 919 diffs.remove(pointer - 1); 920 pointer--; 921 } 922 diffs[pointer].text = best_edit; 923 if( best_equality2.length != 0 ){ 924 diffs[pointer+1].text = best_equality2; 925 } else { 926 diffs.remove(pointer + 1); 927 pointer--; 928 } 929 } 930 } 931 pointer++; 932 } 933 } 934 935 936 937 938 /** 939 * Reorder and merge like edit sections. Merge equalities. 940 * Any edit section can move as long as it doesn't cross an equality. 941 * @param diffs List of Diff objects. 942 */ 943 void cleanupMerge(ref Diff[] diffs) { 944 diffs ~= Diff(Operation.EQUAL, ""); 945 auto pointer = 0; 946 auto count_delete = 0; 947 auto count_insert = 0; 948 string text_delete; 949 string text_insert; 950 auto commonlength = 0; 951 while(pointer < diffs.length) { 952 final switch(diffs[pointer].operation){ 953 case Operation.INSERT: 954 count_insert++; 955 text_insert ~= diffs[pointer].text; 956 pointer++; 957 break; 958 case Operation.DELETE: 959 count_delete++; 960 text_delete ~= diffs[pointer].text; 961 pointer++; 962 break; 963 case Operation.EQUAL: 964 // Upon reaching an equality, check for prior redundancies. 965 if (count_delete + count_insert > 1) { 966 if (count_delete != 0 && count_insert != 0) { 967 // Factor out any common prefixies. 968 commonlength = commonPrefix(text_insert, text_delete); 969 if (commonlength != 0) { 970 if ((pointer - count_delete - count_insert) > 0 && 971 diffs[pointer - count_delete - count_insert - 1].operation 972 == Operation.EQUAL) { 973 diffs[pointer - count_delete - count_insert - 1].text 974 ~= text_insert.substr(0, commonlength); 975 } else { 976 diffs.insert(0, [Diff(Operation.EQUAL, text_insert.substr(0, commonlength))]); 977 pointer++; 978 } 979 text_insert = text_insert.substr(commonlength); 980 text_delete = text_delete.substr(commonlength); 981 } 982 // Factor out any common suffixies. 983 commonlength = commonSuffix(text_insert, text_delete); 984 if (commonlength != 0) { 985 diffs[pointer].text = text_insert.substr(text_insert.length - commonlength) ~ diffs[pointer].text; 986 text_insert = text_insert.substr(0, text_insert.length - commonlength); 987 text_delete = text_delete.substr(0, text_delete.length - commonlength); 988 } 989 } 990 // Delete the offending records and add the merged ones. 991 if (count_delete == 0) { 992 diffs.splice(pointer - count_insert, count_delete + count_insert, [Diff(Operation.INSERT, text_insert)]); 993 } else if (count_insert == 0) { 994 995 diffs.splice(pointer - count_delete, count_delete + count_insert, [Diff(Operation.DELETE, text_delete)]); 996 } else { 997 diffs.splice(pointer - count_delete - count_insert, count_delete + count_insert, [Diff(Operation.DELETE, text_delete), Diff(Operation.INSERT, text_insert)]); 998 } 999 pointer = pointer - count_delete - count_insert + 1000 (count_delete != 0 ? 1 : 0) + (count_insert != 0 ? 1 : 0) + 1; 1001 } else if( pointer != 0 && diffs[pointer-1].operation == Operation.EQUAL ){ 1002 diffs[pointer - 1].text ~= diffs[pointer].text; 1003 diffs.remove(pointer); 1004 } else { 1005 pointer++; 1006 } 1007 count_insert = 0; 1008 count_delete = 0; 1009 text_delete = ""; 1010 text_insert = ""; 1011 break; 1012 } 1013 } 1014 if( diffs[$-1].text.length == 0){ 1015 diffs.remove(diffs.length - 1); 1016 } 1017 1018 bool changes = false; 1019 pointer = 1; 1020 while( pointer < (cast(sizediff_t)diffs.length - 1) ) { 1021 if( diffs[pointer - 1].operation == Operation.EQUAL && 1022 diffs[pointer + 1].operation == Operation.EQUAL) { 1023 if( diffs[pointer].text.endsWith(diffs[pointer - 1].text)) { 1024 diffs[pointer].text = diffs[pointer - 1].text ~ diffs[pointer].text.substr(0, diffs[pointer].text.length - diffs[pointer - 1].text.length); 1025 diffs[pointer + 1].text = diffs[pointer - 1].text ~ diffs[pointer + 1].text; 1026 diffs.splice(pointer - 1, 1); 1027 changes = true; 1028 } else if( diffs[pointer].text.startsWith(diffs[pointer + 1].text)) { 1029 diffs[pointer - 1].text ~= diffs[pointer + 1].text; 1030 diffs[pointer].text = 1031 diffs[pointer].text.substr(diffs[pointer + 1].text.length) 1032 ~ diffs[pointer + 1].text; 1033 diffs.splice(pointer + 1, 1); 1034 changes = true; 1035 } 1036 } 1037 pointer++; 1038 } 1039 if( changes ) cleanupMerge(diffs); 1040 1041 } 1042 1043 1044 1045 /** 1046 * Given two strings, comAdde a score representing whether the internal 1047 * boundary falls on logical boundaries. 1048 * Scores range from 6 (best) to 0 (worst). 1049 * @param one First string. 1050 * @param two Second string. 1051 * @return The score. 1052 */ 1053 int cleanupSemanticScore(string one, string two) 1054 { 1055 if( one.length == 0 || two.length == 0) return 6; //Edges are the best 1056 auto char1 = one[$-1]; 1057 auto char2 = two[0]; 1058 1059 auto nonAlphaNumeric1 = !(isAlpha(char1) || isNumber(char1)); 1060 auto nonAlphaNumeric2 = !(isAlpha(char2) || isNumber(char2)); 1061 auto whitespace1 = nonAlphaNumeric1 && isWhite(char1); 1062 auto whitespace2 = nonAlphaNumeric2 && isWhite(char2); 1063 auto lineBreak1 = whitespace1 && isControl(char1); 1064 auto lineBreak2 = whitespace2 && isControl(char2); 1065 auto blankLine1 = lineBreak1 && match(one, `\n\r?\n\Z`); 1066 auto blankLine2 = lineBreak2 && match(two, `\A\r?\n\r?\n`); 1067 1068 if (blankLine1 || blankLine2) return 5; 1069 else if (lineBreak1 || lineBreak2) return 4; 1070 else if (nonAlphaNumeric1 && !whitespace1 && whitespace2) return 3; 1071 else if (whitespace1 || whitespace2) return 2; 1072 else if (nonAlphaNumeric1 || nonAlphaNumeric2) return 1; 1073 1074 return 0; 1075 } 1076 1077 1078 /** 1079 * Reduce the number of edits by eliminating operationally trivial 1080 * equalities. 1081 * @param diffs List of Diff objects. 1082 */ 1083 void cleanupEfficiency(Diff[] diffs) { 1084 bool changes = false; 1085 int[] equalities; 1086 string lastequality; 1087 int pointer = 0; 1088 auto pre_ins = false; 1089 auto pre_del = false; 1090 auto post_ins = false; 1091 auto post_del = false; 1092 while( pointer < diffs.length ){ 1093 if( diffs[pointer].operation == Operation.EQUAL ){ 1094 if( diffs[pointer].text.length < DIFF_EDIT_COST && (post_ins || post_del)) { 1095 equalities ~= pointer; 1096 pre_ins = post_ins; 1097 pre_del = post_del; 1098 lastequality = diffs[pointer].text; 1099 } else { 1100 equalities = []; 1101 lastequality = ""; 1102 } 1103 post_ins = false; 1104 post_del = false; 1105 } else { 1106 if( diffs[pointer].operation == Operation.DELETE ){ 1107 post_del = true; 1108 } else { 1109 post_ins = true; 1110 } 1111 1112 if( (lastequality.length != 0) 1113 && ((pre_ins && pre_del && post_ins && post_del) 1114 || ((lastequality.length < DIFF_EDIT_COST / 2) 1115 && ((pre_ins ? 1 : 0) + (pre_del ? 1 : 0) + (post_ins ? 1 : 0) 1116 + (post_del ? 1 : 0)) == 3))) { 1117 1118 diffs.insert(equalities[$], [Diff(Operation.DELETE, lastequality)]); 1119 diffs[equalities[$] + 1].operation = Operation.INSERT; 1120 equalities = equalities[0..$-1]; 1121 lastequality = ""; 1122 if( pre_ins && pre_del ){ 1123 post_ins = false; 1124 post_ins = false; 1125 equalities = []; 1126 } else { 1127 if( equalities.length > 0 ) equalities = equalities[0..$-1]; 1128 pointer = equalities.length > 0 ? equalities[$] : -1; 1129 post_ins = false; 1130 post_del = false; 1131 } 1132 changes = true; 1133 } 1134 } 1135 pointer++; 1136 } 1137 1138 if( changes ){ 1139 cleanupMerge(diffs); 1140 } 1141 } 1142 1143 /** 1144 * loc is a location in text1, comAdde and return the equivalent location in 1145 * text2. 1146 * e.g. "The cat" vs "The big cat", 1->1, 5->8 1147 * @param diffs List of Diff objects. 1148 * @param loc Location within text1. 1149 * @return Location within text2. 1150 */ 1151 int xIndex(Diff[] diffs, int loc){ 1152 auto chars1 = 0; 1153 auto chars2 = 0; 1154 auto last_chars1 = 0; 1155 auto last_chars2 = 0; 1156 Diff lastDiff; 1157 foreach ( diff; diffs) { 1158 if (diff.operation != Operation.INSERT) { 1159 // Equality or deletion. 1160 chars1 += diff.text.length; 1161 } 1162 if (diff.operation != Operation.DELETE) { 1163 // Equality or insertion. 1164 chars2 += diff.text.length; 1165 } 1166 if (chars1 > loc) { 1167 // Overshot the location. 1168 lastDiff = diff; 1169 break; 1170 } 1171 last_chars1 = chars1; 1172 last_chars2 = chars2; 1173 } 1174 if (lastDiff.operation == Operation.DELETE) { 1175 // The location was deleted. 1176 return last_chars2; 1177 } 1178 // Add the remaining character length. 1179 return last_chars2 + (loc - last_chars1); 1180 } 1181 1182 /** 1183 * Unescape selected chars for compatability with JavaScript's encodeURI. 1184 * In speed critical applications this could be dropped since the 1185 * receiving application will certainly decode these fine. 1186 * Note that this function is case-sensitive. Thus "%3F" would not be 1187 * unescaped. But this is ok because it is only called with the output of 1188 * HttpUtility.UrlEncode which returns lowercase hex. 1189 * 1190 * Example: "%3f" -> "?", "%24" -> "$", etc. 1191 * 1192 * @param str The string to escape. 1193 * @return The escaped string. 1194 */ 1195 public static string unescapeForEncodeUriCompatability(string str) 1196 { 1197 // FIXME: this is ridiculously inefficient 1198 return str.replace("%21", "!").replace("%7e", "~") 1199 .replace("%27", "'").replace("%28", "(").replace("%29", ")") 1200 .replace("%3b", ";").replace("%2f", "/").replace("%3f", "?") 1201 .replace("%3a", ":").replace("%40", "@").replace("%26", "&") 1202 .replace("%3d", "=").replace("%2b", "+").replace("%24", "$") 1203 .replace("%2c", ",").replace("%23", "#"); 1204 }