1 /*
2  * Copyright 2008 Google Inc. All Rights Reserved.
3  * Copyright 2013-2014 Jan Krüger. All Rights Reserved.
4  * Author: fraser@google.com (Neil Fraser)
5  * Author: anteru@developer.shelter13.net (Matthaeus G. Chajdas)
6  * Author: jan@jandoe.de (Jan Krüger)
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *   http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  * Diff Match and Patch
21  * http://code.google.com/p/google-diff-match-patch/
22  */
23 module ddmp.match;
24 
25 import std.algorithm : min, max;
26 import std.array;
27 import std.math : abs;
28 import std..string;
29 
30 import ddmp.util;
31 
32 float MATCH_THRESHOLD = 0.5f;
33 int MATCH_DISTANCE = 1000; 
34 
35 /**
36  * Locate the best instance of 'pattern' in 'text' near 'loc'.
37  * Returns -1 if no match found.
38  * @param text The text to search.
39  * @param pattern The pattern to search for.
40  * @param loc The location to search around.
41  * @return Best match index or -1.
42  */
43 sizediff_t match_main(string text, string pattern, sizediff_t loc)
44 {
45 	loc = max(0, min(loc, text.length));
46 	if( text == pattern ){
47 		return 0; // Shortcut (potentially not guaranteed by the algorithm)
48 	} else if( text.length == 0 ){
49 		return -1; // Nothing to match
50 	} else if( loc + pattern.length <= text.length && text.substr(loc, pattern.length) == pattern){
51 		return loc; // Perfect match at the perfect spot! (Includes case of null pattern)
52 	} else {
53 		return bitap(text, pattern, loc);
54 	}
55 }
56 
57 
58 /**
59  * Locate the best instance of 'pattern' in 'text' near 'loc' using the
60  * Bitap algorithm.  Returns -1 if no match found.
61  * @param text The text to search.
62  * @param pattern The pattern to search for.
63  * @param loc The location to search around.
64  * @return Best match index or -1.
65  */
66 sizediff_t bitap(string text, string pattern, sizediff_t loc)
67 {
68 	// bits need to fit into the positive part of an int
69 	assert(pattern.length <= 31);
70 
71 	int[char] s = initAlphabet(pattern);
72 	double score_threshold = MATCH_THRESHOLD;
73 	auto best_loc = text.indexOfAlt(pattern, loc);
74 	if( best_loc != -1 ){
75 		score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold);
76 
77 		best_loc = text[0..min(loc + pattern.length, text.length)].lastIndexOf(pattern);
78 		if( best_loc != -1){
79 			score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold);
80 		}		
81 	}
82 
83 	sizediff_t matchmask = 1 << (pattern.length - 1);
84 	best_loc = -1;
85 
86 	sizediff_t bin_min;
87 	sizediff_t bin_mid;
88 	sizediff_t bin_max = pattern.length + text.length;
89 
90 	sizediff_t[] last_rd;
91     for(sizediff_t d = 0; d < pattern.length; d++){
92         // Scan for the best match; each iteration allows for one more error.
93         // Run a binary search to determine how far from 'loc' we can stray at
94         // this error level.
95         bin_min = 0;
96         bin_mid = bin_max;
97         while( bin_min < bin_mid ){
98         	if( bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold){
99         		bin_min = bin_mid;
100         	} else {
101         		bin_max = bin_mid;
102         	}
103         	bin_mid = (bin_max - bin_min) / 2 + bin_min;
104         }
105         bin_max = bin_mid;
106         sizediff_t start = max(1, loc - bin_mid + 1);
107         sizediff_t finish = min(loc + bin_mid, text.length) + pattern.length;
108 		
109 		sizediff_t[] rd = new sizediff_t[finish + 2];
110 		rd[finish + 1] = (1 << d) - 1;
111 		for( sizediff_t j = finish; j >= start; j--) {
112 			sizediff_t charMatch;
113 			if( text.length <= j - 1 || !( text[j - 1] in s) ) {
114 				charMatch = 0;
115 			} else {
116 				charMatch = s[text[j - 1]];
117 			}
118 			if( d == 0 ){
119 				rd[j] = ((rd[j + 1] << 1) | 1) & charMatch;
120 			} else {
121 				rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1];
122 			}
123 			if( (rd[j] & matchmask) != 0) {
124 				auto score = bitapScore(d, j - 1, loc, pattern);
125 				if( score <= score_threshold ){
126 					score_threshold = score;
127 					best_loc = j - 1;
128 					if( best_loc > loc ){
129 						start = max(1, 2 * loc - best_loc);
130 					} else {
131 						break;
132 					}
133 				}
134 			}
135 		}
136 		if( bitapScore(d + 1, loc, loc, pattern) > score_threshold) {
137 			break;
138 		}
139 		last_rd = rd;
140     }
141     return best_loc;
142 }
143 
144 /**
145  * Compute and return the score for a match with e errors and x location.
146  * @param e Number of errors in match.
147  * @param x Location of match.
148  * @param loc Expected location of match.
149  * @param pattern Pattern being sought.
150  * @return Overall score for match (0.0 = good, 1.0 = bad).
151  */
152 double bitapScore(sizediff_t e, sizediff_t x, sizediff_t loc, string pattern) 
153 {
154 	auto accuracy = cast(float)e / pattern.length;
155 	sizediff_t proximity = abs(loc - x);
156 	if( MATCH_DISTANCE == 0 ){
157 		return proximity == 0 ? accuracy : 1.0;
158 	}
159 	return accuracy + (proximity / cast(float)MATCH_DISTANCE);
160 }
161 
162 /**
163  * Initialise the alphabet for the Bitap algorithm.
164  * @param pattern The text to encode.
165  * @return Hash of character locations.
166  */
167 int[char] initAlphabet(string pattern)
168 {
169 	int[char] s;
170 	foreach( c ; pattern ){
171 		if( c !in s )s[c] = 0;
172 	}
173 	foreach( i, c; pattern ){
174 		auto value = s[c] | (1 << (pattern.length - i - 1));
175 		s[c] = value; 
176 	}
177 	return s;
178 }