1 /* 2 * Copyright 2008 Google Inc. All Rights Reserved. 3 * Copyright 2013-2014 Jan Krüger. All Rights Reserved. 4 * Author: fraser@google.com (Neil Fraser) 5 * Author: anteru@developer.shelter13.net (Matthaeus G. Chajdas) 6 * Author: jan@jandoe.de (Jan Krüger) 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 * 20 * Diff Match and Patch 21 * http://code.google.com/p/google-diff-match-patch/ 22 */ 23 module ddmp.match; 24 25 import std.algorithm : min, max; 26 import std.array; 27 import std.math : abs; 28 import std..string; 29 30 import ddmp.util; 31 32 float MATCH_THRESHOLD = 0.5f; 33 int MATCH_DISTANCE = 1000; 34 35 /** 36 * Locate the best instance of 'pattern' in 'text' near 'loc'. 37 * Returns -1 if no match found. 38 * @param text The text to search. 39 * @param pattern The pattern to search for. 40 * @param loc The location to search around. 41 * @return Best match index or -1. 42 */ 43 sizediff_t match_main(string text, string pattern, sizediff_t loc) 44 { 45 loc = max(0, min(loc, text.length)); 46 if( text == pattern ){ 47 return 0; // Shortcut (potentially not guaranteed by the algorithm) 48 } else if( text.length == 0 ){ 49 return -1; // Nothing to match 50 } else if( loc + pattern.length <= text.length && text.substr(loc, pattern.length) == pattern){ 51 return loc; // Perfect match at the perfect spot! (Includes case of null pattern) 52 } else { 53 return bitap(text, pattern, loc); 54 } 55 } 56 57 58 /** 59 * Locate the best instance of 'pattern' in 'text' near 'loc' using the 60 * Bitap algorithm. Returns -1 if no match found. 61 * @param text The text to search. 62 * @param pattern The pattern to search for. 63 * @param loc The location to search around. 64 * @return Best match index or -1. 65 */ 66 sizediff_t bitap(string text, string pattern, sizediff_t loc) 67 { 68 // bits need to fit into the positive part of an int 69 assert(pattern.length <= 31); 70 71 int[char] s = initAlphabet(pattern); 72 double score_threshold = MATCH_THRESHOLD; 73 auto best_loc = text.indexOfAlt(pattern, loc); 74 if( best_loc != -1 ){ 75 score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold); 76 77 best_loc = text[0..min(loc + pattern.length, text.length)].lastIndexOf(pattern); 78 if( best_loc != -1){ 79 score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold); 80 } 81 } 82 83 sizediff_t matchmask = 1 << (pattern.length - 1); 84 best_loc = -1; 85 86 sizediff_t bin_min; 87 sizediff_t bin_mid; 88 sizediff_t bin_max = pattern.length + text.length; 89 90 sizediff_t[] last_rd; 91 for(sizediff_t d = 0; d < pattern.length; d++){ 92 // Scan for the best match; each iteration allows for one more error. 93 // Run a binary search to determine how far from 'loc' we can stray at 94 // this error level. 95 bin_min = 0; 96 bin_mid = bin_max; 97 while( bin_min < bin_mid ){ 98 if( bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold){ 99 bin_min = bin_mid; 100 } else { 101 bin_max = bin_mid; 102 } 103 bin_mid = (bin_max - bin_min) / 2 + bin_min; 104 } 105 bin_max = bin_mid; 106 sizediff_t start = max(1, loc - bin_mid + 1); 107 sizediff_t finish = min(loc + bin_mid, text.length) + pattern.length; 108 109 sizediff_t[] rd = new sizediff_t[finish + 2]; 110 rd[finish + 1] = (1 << d) - 1; 111 for( sizediff_t j = finish; j >= start; j--) { 112 sizediff_t charMatch; 113 if( text.length <= j - 1 || !( text[j - 1] in s) ) { 114 charMatch = 0; 115 } else { 116 charMatch = s[text[j - 1]]; 117 } 118 if( d == 0 ){ 119 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; 120 } else { 121 rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; 122 } 123 if( (rd[j] & matchmask) != 0) { 124 auto score = bitapScore(d, j - 1, loc, pattern); 125 if( score <= score_threshold ){ 126 score_threshold = score; 127 best_loc = j - 1; 128 if( best_loc > loc ){ 129 start = max(1, 2 * loc - best_loc); 130 } else { 131 break; 132 } 133 } 134 } 135 } 136 if( bitapScore(d + 1, loc, loc, pattern) > score_threshold) { 137 break; 138 } 139 last_rd = rd; 140 } 141 return best_loc; 142 } 143 144 /** 145 * Compute and return the score for a match with e errors and x location. 146 * @param e Number of errors in match. 147 * @param x Location of match. 148 * @param loc Expected location of match. 149 * @param pattern Pattern being sought. 150 * @return Overall score for match (0.0 = good, 1.0 = bad). 151 */ 152 double bitapScore(sizediff_t e, sizediff_t x, sizediff_t loc, string pattern) 153 { 154 auto accuracy = cast(float)e / pattern.length; 155 sizediff_t proximity = abs(loc - x); 156 if( MATCH_DISTANCE == 0 ){ 157 return proximity == 0 ? accuracy : 1.0; 158 } 159 return accuracy + (proximity / cast(float)MATCH_DISTANCE); 160 } 161 162 /** 163 * Initialise the alphabet for the Bitap algorithm. 164 * @param pattern The text to encode. 165 * @return Hash of character locations. 166 */ 167 int[char] initAlphabet(string pattern) 168 { 169 int[char] s; 170 foreach( c ; pattern ){ 171 if( c !in s )s[c] = 0; 172 } 173 foreach( i, c; pattern ){ 174 auto value = s[c] | (1 << (pattern.length - i - 1)); 175 s[c] = value; 176 } 177 return s; 178 }