1 /* 2 * Copyright 2008 Google Inc. All Rights Reserved. 3 * Copyright 2013-2014 Jan Krüger. All Rights Reserved. 4 * Author: fraser@google.com (Neil Fraser) 5 * Author: anteru@developer.shelter13.net (Matthaeus G. Chajdas) 6 * Author: jan@jandoe.de (Jan Krüger) 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 * 20 * Diff Match and Patch 21 * http://code.google.com/p/google-diff-match-patch/ 22 */ 23 module ddmp.match; 24 25 import std.algorithm : min, max; 26 import std.array; 27 import std.math : abs; 28 import std..string; 29 30 import ddmp.util; 31 32 float MATCH_THRESHOLD = 0.5f; 33 int MATCH_DISTANCE = 1000; 34 35 /** 36 * Locate the best instance of 'pattern' in 'text' near 'loc'. 37 * Returns -1 if no match found. 38 * @param text The text to search. 39 * @param pattern The pattern to search for. 40 * @param loc The location to search around. 41 * @return Best match index or -1. 42 */ 43 int match_main(string text, string pattern, int loc) 44 { 45 loc = max(0, min(loc, text.length)); 46 if( text == pattern ){ 47 return 0; // Shortcut (potentially not guaranteed by the algorithm) 48 } else if( text.length == 0 ){ 49 return -1; // Nothing to match 50 } else if( loc + pattern.length <= text.length && text.substr(loc, pattern.length) == pattern){ 51 return loc; // Perfect match at the perfect spot! (Includes case of null pattern) 52 } else { 53 return bitap(text, pattern, loc); 54 } 55 } 56 57 58 /** 59 * Locate the best instance of 'pattern' in 'text' near 'loc' using the 60 * Bitap algorithm. Returns -1 if no match found. 61 * @param text The text to search. 62 * @param pattern The pattern to search for. 63 * @param loc The location to search around. 64 * @return Best match index or -1. 65 */ 66 int bitap(string text, string pattern, int loc) 67 { 68 int[char] s = initAlphabet(pattern); 69 double score_threshold = MATCH_THRESHOLD; 70 auto best_loc = text.indexOfAlt(pattern, loc); 71 if( best_loc != -1 ){ 72 score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold); 73 74 best_loc = text[0..min(loc + pattern.length, text.length)].lastIndexOf(pattern); 75 if( best_loc != -1){ 76 score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold); 77 } 78 } 79 80 int matchmask = 1 << (pattern.length - 1); 81 best_loc = -1; 82 83 int bin_min; 84 int bin_mid; 85 int bin_max = pattern.length + text.length; 86 87 int[] last_rd; 88 for(int d = 0; d < pattern.length; d++){ 89 // Scan for the best match; each iteration allows for one more error. 90 // Run a binary search to determine how far from 'loc' we can stray at 91 // this error level. 92 bin_min = 0; 93 bin_mid = bin_max; 94 while( bin_min < bin_mid ){ 95 if( bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold){ 96 bin_min = bin_mid; 97 } else { 98 bin_max = bin_mid; 99 } 100 bin_mid = (bin_max - bin_min) / 2 + bin_min; 101 } 102 bin_max = bin_mid; 103 int start = max(1, loc - bin_mid + 1); 104 int finish = min(loc + bin_mid, text.length) + pattern.length; 105 106 int[] rd = new int[finish + 2]; 107 rd[finish + 1] = (1 << d) - 1; 108 for( int j = finish; j >= start; j--) { 109 int charMatch; 110 if( text.length <= j - 1 || !( text[j - 1] in s) ) { 111 charMatch = 0; 112 } else { 113 charMatch = s[text[j - 1]]; 114 } 115 if( d == 0 ){ 116 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; 117 } else { 118 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; 119 } 120 if( (rd[j] & matchmask) != 0) { 121 auto score = bitapScore(d, j - 1, loc, pattern); 122 if( score <= score_threshold ){ 123 score_threshold = score; 124 best_loc = j - 1; 125 if( best_loc > loc ){ 126 start = max(1, 2 * loc - best_loc); 127 } else { 128 break; 129 } 130 } 131 } 132 } 133 if( bitapScore(d + 1, loc, loc, pattern) > score_threshold) { 134 break; 135 } 136 last_rd = rd; 137 } 138 return best_loc; 139 } 140 141 /** 142 * Compute and return the score for a match with e errors and x location. 143 * @param e Number of errors in match. 144 * @param x Location of match. 145 * @param loc Expected location of match. 146 * @param pattern Pattern being sought. 147 * @return Overall score for match (0.0 = good, 1.0 = bad). 148 */ 149 double bitapScore(int e, int x, int loc, string pattern) 150 { 151 auto accuracy = cast(float)e / pattern.length; 152 int proximity = abs(loc - x); 153 if( MATCH_DISTANCE == 0 ){ 154 return proximity == 0 ? accuracy : 1.0; 155 } 156 return accuracy + (proximity / cast(float)MATCH_DISTANCE); 157 } 158 159 /** 160 * Initialise the alphabet for the Bitap algorithm. 161 * @param pattern The text to encode. 162 * @return Hash of character locations. 163 */ 164 int[char] initAlphabet(string pattern) 165 { 166 int[char] s; 167 foreach( c ; pattern ){ 168 if( c !in s )s[c] = 0; 169 } 170 foreach( i, c; pattern ){ 171 auto value = s[c] | (1 << (pattern.length - i - 1)); 172 s[c] = value; 173 } 174 return s; 175 }