1 /*
2  * Copyright 2008 Google Inc. All Rights Reserved.
3  * Copyright 2013-2014 Jan Krüger. All Rights Reserved.
4  * Author: fraser@google.com (Neil Fraser)
5  * Author: anteru@developer.shelter13.net (Matthaeus G. Chajdas)
6  * Author: jan@jandoe.de (Jan Krüger)
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *   http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  * Diff Match and Patch
21  * http://code.google.com/p/google-diff-match-patch/
22  */
23 module ddmp.match;
24 
25 import std.algorithm : min, max;
26 import std.array;
27 import std.math : abs;
28 import std..string;
29 
30 import ddmp.util;
31 
32 float MATCH_THRESHOLD = 0.5f;
33 int MATCH_DISTANCE = 1000; 
34 
35 /**
36  * Locate the best instance of 'pattern' in 'text' near 'loc'.
37  * Returns -1 if no match found.
38  * @param text The text to search.
39  * @param pattern The pattern to search for.
40  * @param loc The location to search around.
41  * @return Best match index or -1.
42  */
43 int match_main(string text, string pattern, int loc)
44 {
45 	loc = max(0, min(loc, text.length));
46 	if( text == pattern ){
47 		return 0; // Shortcut (potentially not guaranteed by the algorithm)
48 	} else if( text.length == 0 ){
49 		return -1; // Nothing to match
50 	} else if( loc + pattern.length <= text.length && text.substr(loc, pattern.length) == pattern){
51 		return loc; // Perfect match at the perfect spot! (Includes case of null pattern)
52 	} else {
53 		return bitap(text, pattern, loc);
54 	}
55 }
56 
57 
58 /**
59  * Locate the best instance of 'pattern' in 'text' near 'loc' using the
60  * Bitap algorithm.  Returns -1 if no match found.
61  * @param text The text to search.
62  * @param pattern The pattern to search for.
63  * @param loc The location to search around.
64  * @return Best match index or -1.
65  */
66 int bitap(string text, string pattern, int loc)
67 {
68 	int[char] s = initAlphabet(pattern);
69 	double score_threshold = MATCH_THRESHOLD;
70 	auto best_loc = text.indexOfAlt(pattern, loc);
71 	if( best_loc != -1 ){
72 		score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold);
73 
74 		best_loc = text[0..min(loc + pattern.length, text.length)].lastIndexOf(pattern);
75 		if( best_loc != -1){
76 			score_threshold = min(bitapScore(0, best_loc, loc, pattern), score_threshold);
77 		}		
78 	}
79 
80 	int matchmask = 1 << (pattern.length - 1);
81 	best_loc = -1;
82 
83 	int bin_min;
84 	int bin_mid;
85 	int bin_max = pattern.length + text.length;
86 
87 	int[] last_rd;
88     for(int d = 0; d < pattern.length; d++){
89         // Scan for the best match; each iteration allows for one more error.
90         // Run a binary search to determine how far from 'loc' we can stray at
91         // this error level.
92         bin_min = 0;
93         bin_mid = bin_max;
94         while( bin_min < bin_mid ){
95         	if( bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold){
96         		bin_min = bin_mid;
97         	} else {
98         		bin_max = bin_mid;
99         	}
100         	bin_mid = (bin_max - bin_min) / 2 + bin_min;
101         }
102         bin_max = bin_mid;
103         int start = max(1, loc - bin_mid + 1);
104         int finish = min(loc + bin_mid, text.length) + pattern.length;
105 		
106 		int[] rd = new int[finish + 2];
107 		rd[finish + 1] = (1 << d) - 1;
108 		for( int j = finish; j >= start; j--) {
109 			int charMatch;
110 			if( text.length <= j - 1 || !( text[j - 1] in s) ) {
111 				charMatch = 0;
112 			} else {
113 				charMatch = s[text[j - 1]];
114 			}
115 			if( d == 0 ){
116 				rd[j] = ((rd[j + 1] << 1) | 1) & charMatch;
117 			} else {
118 				rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1];
119 			}
120 			if( (rd[j] & matchmask) != 0) {
121 				auto score = bitapScore(d, j - 1, loc, pattern);
122 				if( score <= score_threshold ){
123 					score_threshold = score;
124 					best_loc = j - 1;
125 					if( best_loc > loc ){
126 						start = max(1, 2 * loc - best_loc);
127 					} else {
128 						break;
129 					}
130 				}
131 			}
132 		}
133 		if( bitapScore(d + 1, loc, loc, pattern) > score_threshold) {
134 			break;
135 		}
136 		last_rd = rd;
137     }
138     return best_loc;
139 }
140 
141 /**
142  * Compute and return the score for a match with e errors and x location.
143  * @param e Number of errors in match.
144  * @param x Location of match.
145  * @param loc Expected location of match.
146  * @param pattern Pattern being sought.
147  * @return Overall score for match (0.0 = good, 1.0 = bad).
148  */
149 double bitapScore(int e, int x, int loc, string pattern) 
150 {
151 	auto accuracy = cast(float)e / pattern.length;
152 	int proximity = abs(loc - x);
153 	if( MATCH_DISTANCE == 0 ){
154 		return proximity == 0 ? accuracy : 1.0;
155 	}
156 	return accuracy + (proximity / cast(float)MATCH_DISTANCE);
157 }
158 
159 /**
160  * Initialise the alphabet for the Bitap algorithm.
161  * @param pattern The text to encode.
162  * @return Hash of character locations.
163  */
164 int[char] initAlphabet(string pattern)
165 {
166 	int[char] s;
167 	foreach( c ; pattern ){
168 		if( c !in s )s[c] = 0;
169 	}
170 	foreach( i, c; pattern ){
171 		auto value = s[c] | (1 << (pattern.length - i - 1));
172 		s[c] = value; 
173 	}
174 	return s;
175 }