No scoring code is changed, so this probably needs more work to be fully
functional.
| ... | ... |
@@ -11,19 +11,83 @@ |
| 11 | 11 |
|
| 12 | 12 |
#include "../config.h" |
| 13 | 13 |
|
| 14 |
+const char *ignore_escapes(const char *haystack) {
|
|
| 15 |
+ enum {
|
|
| 16 |
+ state_default, |
|
| 17 |
+ state_escaped, |
|
| 18 |
+ state_in_csi, |
|
| 19 |
+ state_start_osc, |
|
| 20 |
+ state_in_osc, |
|
| 21 |
+ state_ignore_next, |
|
| 22 |
+ } state = state_default; |
|
| 23 |
+ do switch (state) {
|
|
| 24 |
+ case state_default: switch (*haystack) {
|
|
| 25 |
+ case '\x1b': |
|
| 26 |
+ state = state_escaped; break; |
|
| 27 |
+ case '\x7': |
|
| 28 |
+ return ++haystack; |
|
| 29 |
+ default: |
|
| 30 |
+ return haystack; |
|
| 31 |
+ }; break; |
|
| 32 |
+ case state_escaped: switch (*haystack) {
|
|
| 33 |
+ case '[': |
|
| 34 |
+ state = state_in_csi; break; |
|
| 35 |
+ case ']': |
|
| 36 |
+ state = state_start_osc; break; |
|
| 37 |
+ case '%': case '(': case ')': case '#':
|
|
| 38 |
+ case '0': case '3': case '5': case '6': |
|
| 39 |
+ state = state_ignore_next; break; |
|
| 40 |
+ case '<': case '=': case '>': case '\x7': |
|
| 41 |
+ case '1': case '2': case '7': case '8': |
|
| 42 |
+ case 'c': case 's': case 'u': |
|
| 43 |
+ case 'A': case 'B': case 'C': case 'D': case 'E': |
|
| 44 |
+ case 'H': case 'I': case 'J': case 'K': case 'M': |
|
| 45 |
+ case 'N': case 'O': case 'S': case 'T': case 'Z': |
|
| 46 |
+ return ++haystack; |
|
| 47 |
+ default: |
|
| 48 |
+ return haystack; |
|
| 49 |
+ }; break; |
|
| 50 |
+ case state_in_csi: switch (*haystack) {
|
|
| 51 |
+ case ';': case '?': |
|
| 52 |
+ case '0': case '1': case '2': case '3': case '4': |
|
| 53 |
+ case '5': case '6': case '7': case '8': case '9': |
|
| 54 |
+ break; |
|
| 55 |
+ default: |
|
| 56 |
+ return ++haystack; |
|
| 57 |
+ }; break; |
|
| 58 |
+ case state_start_osc: switch (*haystack) {
|
|
| 59 |
+ case '0': case '1': case '2': case '3': case '4': |
|
| 60 |
+ case '5': case '6': case '7': case '8': case '9': |
|
| 61 |
+ state = state_in_osc; break; |
|
| 62 |
+ default: |
|
| 63 |
+ return ++haystack; |
|
| 64 |
+ }; break; |
|
| 65 |
+ case state_in_osc: switch (*haystack) {
|
|
| 66 |
+ case '\x7': |
|
| 67 |
+ return ++haystack; |
|
| 68 |
+ case '\x1b': |
|
| 69 |
+ state = state_ignore_next; break; |
|
| 70 |
+ }; break; |
|
| 71 |
+ case state_ignore_next: |
|
| 72 |
+ return ++haystack; |
|
| 73 |
+ } while (*++haystack); |
|
| 74 |
+ return haystack; |
|
| 75 |
+} |
|
| 76 |
+ |
|
| 14 | 77 |
char *strcasechr(const char *s, char c) {
|
| 15 | 78 |
const char accept[3] = {c, toupper(c), 0};
|
| 16 | 79 |
return strpbrk(s, accept); |
| 17 | 80 |
} |
| 18 | 81 |
|
| 19 | 82 |
int has_match(const char *needle, const char *haystack) {
|
| 83 |
+ haystack = ignore_escapes(haystack); |
|
| 20 | 84 |
while (*needle) {
|
| 21 | 85 |
char nch = *needle++; |
| 22 | 86 |
|
| 23 | 87 |
if (!(haystack = strcasechr(haystack, nch))) {
|
| 24 | 88 |
return 0; |
| 25 | 89 |
} |
| 26 |
- haystack++; |
|
| 90 |
+ haystack = ignore_escapes(++haystack); |
|
| 27 | 91 |
} |
| 28 | 92 |
return 1; |
| 29 | 93 |
} |
| ... | ... |
@@ -79,7 +79,8 @@ static inline void match_row(const struct match_struct *match, int row, score_t |
| 79 | 79 |
score_t prev_score = SCORE_MIN; |
| 80 | 80 |
score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
| 81 | 81 |
|
| 82 |
- score_t prev_M, prev_D; |
|
| 82 |
+ /* These will not be used with this value, but not all compilers see it */ |
|
| 83 |
+ score_t prev_M = SCORE_MIN, prev_D = SCORE_MIN; |
|
| 83 | 84 |
|
| 84 | 85 |
for (int j = 0; j < m; j++) {
|
| 85 | 86 |
if (lower_needle[i] == lower_haystack[j]) {
|
| ... | ... |
@@ -28,8 +28,6 @@ int has_match(const char *needle, const char *haystack) {
|
| 28 | 28 |
return 1; |
| 29 | 29 |
} |
| 30 | 30 |
|
| 31 |
-#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
|
| 32 |
- |
|
| 33 | 31 |
#define max(a, b) (((a) > (b)) ? (a) : (b)) |
| 34 | 32 |
|
| 35 | 33 |
struct match_struct {
|
| ... | ... |
@@ -81,6 +79,8 @@ static inline void match_row(const struct match_struct *match, int row, score_t |
| 81 | 79 |
score_t prev_score = SCORE_MIN; |
| 82 | 80 |
score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
| 83 | 81 |
|
| 82 |
+ score_t prev_M, prev_D; |
|
| 83 |
+ |
|
| 84 | 84 |
for (int j = 0; j < m; j++) {
|
| 85 | 85 |
if (lower_needle[i] == lower_haystack[j]) {
|
| 86 | 86 |
score_t score = SCORE_MIN; |
| ... | ... |
@@ -88,14 +88,18 @@ static inline void match_row(const struct match_struct *match, int row, score_t |
| 88 | 88 |
score = (j * SCORE_GAP_LEADING) + match_bonus[j]; |
| 89 | 89 |
} else if (j) { /* i > 0 && j > 0*/
|
| 90 | 90 |
score = max( |
| 91 |
- last_M[j - 1] + match_bonus[j], |
|
| 91 |
+ prev_M + match_bonus[j], |
|
| 92 | 92 |
|
| 93 | 93 |
/* consecutive match, doesn't stack with match_bonus */ |
| 94 |
- last_D[j - 1] + SCORE_MATCH_CONSECUTIVE); |
|
| 94 |
+ prev_D + SCORE_MATCH_CONSECUTIVE); |
|
| 95 | 95 |
} |
| 96 |
+ prev_D = last_D[j]; |
|
| 97 |
+ prev_M = last_M[j]; |
|
| 96 | 98 |
curr_D[j] = score; |
| 97 | 99 |
curr_M[j] = prev_score = max(score, prev_score + gap_score); |
| 98 | 100 |
} else {
|
| 101 |
+ prev_D = last_D[j]; |
|
| 102 |
+ prev_M = last_M[j]; |
|
| 99 | 103 |
curr_D[j] = SCORE_MIN; |
| 100 | 104 |
curr_M[j] = prev_score = prev_score + gap_score; |
| 101 | 105 |
} |
| ... | ... |
@@ -131,24 +135,13 @@ score_t match(const char *needle, const char *haystack) {
|
| 131 | 135 |
* D[][] Stores the best score for this position ending with a match. |
| 132 | 136 |
* M[][] Stores the best possible score at this position. |
| 133 | 137 |
*/ |
| 134 |
- score_t D[2][MATCH_MAX_LEN], M[2][MATCH_MAX_LEN]; |
|
| 135 |
- |
|
| 136 |
- score_t *last_D, *last_M; |
|
| 137 |
- score_t *curr_D, *curr_M; |
|
| 138 |
- |
|
| 139 |
- last_D = D[0]; |
|
| 140 |
- last_M = M[0]; |
|
| 141 |
- curr_D = D[1]; |
|
| 142 |
- curr_M = M[1]; |
|
| 138 |
+ score_t D[MATCH_MAX_LEN], M[MATCH_MAX_LEN]; |
|
| 143 | 139 |
|
| 144 | 140 |
for (int i = 0; i < n; i++) {
|
| 145 |
- match_row(&match, i, curr_D, curr_M, last_D, last_M); |
|
| 146 |
- |
|
| 147 |
- SWAP(curr_D, last_D, score_t *); |
|
| 148 |
- SWAP(curr_M, last_M, score_t *); |
|
| 141 |
+ match_row(&match, i, D, M, D, M); |
|
| 149 | 142 |
} |
| 150 | 143 |
|
| 151 |
- return last_M[m - 1]; |
|
| 144 |
+ return M[m - 1]; |
|
| 152 | 145 |
} |
| 153 | 146 |
|
| 154 | 147 |
score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
| ... | ... |
@@ -187,17 +180,9 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 187 | 180 |
M = malloc(sizeof(score_t) * MATCH_MAX_LEN * n); |
| 188 | 181 |
D = malloc(sizeof(score_t) * MATCH_MAX_LEN * n); |
| 189 | 182 |
|
| 190 |
- score_t *last_D = NULL, *last_M = NULL; |
|
| 191 |
- score_t *curr_D, *curr_M; |
|
| 192 |
- |
|
| 193 |
- for (int i = 0; i < n; i++) {
|
|
| 194 |
- curr_D = &D[i][0]; |
|
| 195 |
- curr_M = &M[i][0]; |
|
| 196 |
- |
|
| 197 |
- match_row(&match, i, curr_D, curr_M, last_D, last_M); |
|
| 198 |
- |
|
| 199 |
- last_D = curr_D; |
|
| 200 |
- last_M = curr_M; |
|
| 183 |
+ match_row(&match, 0, D[0], M[0], D[0], M[0]); |
|
| 184 |
+ for (int i = 1; i < n; i++) {
|
|
| 185 |
+ match_row(&match, i, D[i], M[i], D[i - 1], M[i - 1]); |
|
| 201 | 186 |
} |
| 202 | 187 |
|
| 203 | 188 |
/* backtrace to find the positions of optimal matching */ |
These pointers were never dereferenced in their uninitialized form, as
that only happens for i > 0, but we should do this to avoid the compiler
warning.
| ... | ... |
@@ -187,7 +187,7 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 187 | 187 |
M = malloc(sizeof(score_t) * MATCH_MAX_LEN * n); |
| 188 | 188 |
D = malloc(sizeof(score_t) * MATCH_MAX_LEN * n); |
| 189 | 189 |
|
| 190 |
- score_t *last_D, *last_M; |
|
| 190 |
+ score_t *last_D = NULL, *last_M = NULL; |
|
| 191 | 191 |
score_t *curr_D, *curr_M; |
| 192 | 192 |
|
| 193 | 193 |
for (int i = 0; i < n; i++) {
|
| ... | ... |
@@ -56,7 +56,7 @@ static void setup_match_struct(struct match_struct *match, const char *needle, c |
| 56 | 56 |
match->needle_len = strlen(needle); |
| 57 | 57 |
match->haystack_len = strlen(haystack); |
| 58 | 58 |
|
| 59 |
- if (match->needle_len > MATCH_MAX_LEN || match->needle_len > match->haystack_len) {
|
|
| 59 |
+ if (match->haystack_len > MATCH_MAX_LEN || match->needle_len > match->haystack_len) {
|
|
| 60 | 60 |
return; |
| 61 | 61 |
} |
| 62 | 62 |
|
Pretty sure this was optimized out anyways, but this is cleaner IMO.
| ... | ... |
@@ -46,9 +46,8 @@ struct match_struct {
|
| 46 | 46 |
|
| 47 | 47 |
static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
| 48 | 48 |
/* Which positions are beginning of words */ |
| 49 |
- int m = strlen(haystack); |
|
| 50 | 49 |
char last_ch = '/'; |
| 51 |
- for (int i = 0; i < m; i++) {
|
|
| 50 |
+ for (int i = 0; haystack[i]; i++) {
|
|
| 52 | 51 |
char ch = haystack[i]; |
| 53 | 52 |
match_bonus[i] = COMPUTE_BONUS(last_ch, ch); |
| 54 | 53 |
last_ch = ch; |
| ... | ... |
@@ -186,7 +186,9 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 186 | 186 |
* D[][] Stores the best score for this position ending with a match. |
| 187 | 187 |
* M[][] Stores the best possible score at this position. |
| 188 | 188 |
*/ |
| 189 |
- score_t D[n][m], M[n][m]; |
|
| 189 |
+ score_t (*D)[MATCH_MAX_LEN], (*M)[MATCH_MAX_LEN]; |
|
| 190 |
+ M = malloc(sizeof(score_t) * MATCH_MAX_LEN * n); |
|
| 191 |
+ D = malloc(sizeof(score_t) * MATCH_MAX_LEN * n); |
|
| 190 | 192 |
|
| 191 | 193 |
score_t *last_D, *last_M; |
| 192 | 194 |
score_t *curr_D, *curr_M; |
| ... | ... |
@@ -230,5 +232,10 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 230 | 232 |
} |
| 231 | 233 |
} |
| 232 | 234 |
|
| 233 |
- return M[n - 1][m - 1]; |
|
| 235 |
+ score_t result = M[n - 1][m - 1]; |
|
| 236 |
+ |
|
| 237 |
+ free(M); |
|
| 238 |
+ free(D); |
|
| 239 |
+ |
|
| 240 |
+ return result; |
|
| 234 | 241 |
} |
| ... | ... |
@@ -32,33 +32,6 @@ int has_match(const char *needle, const char *haystack) {
|
| 32 | 32 |
|
| 33 | 33 |
#define max(a, b) (((a) > (b)) ? (a) : (b)) |
| 34 | 34 |
|
| 35 |
-#ifdef DEBUG_VERBOSE |
|
| 36 |
-/* print one of the internal matrices */ |
|
| 37 |
-void mat_print(score_t *mat, char name, const char *needle, const char *haystack) {
|
|
| 38 |
- int n = strlen(needle); |
|
| 39 |
- int m = strlen(haystack); |
|
| 40 |
- int i, j; |
|
| 41 |
- fprintf(stderr, "%c ", name); |
|
| 42 |
- for (j = 0; j < m; j++) {
|
|
| 43 |
- fprintf(stderr, " %c", haystack[j]); |
|
| 44 |
- } |
|
| 45 |
- fprintf(stderr, "\n"); |
|
| 46 |
- for (i = 0; i < n; i++) {
|
|
| 47 |
- fprintf(stderr, " %c |", needle[i]); |
|
| 48 |
- for (j = 0; j < m; j++) {
|
|
| 49 |
- score_t val = mat[i * m + j]; |
|
| 50 |
- if (val == SCORE_MIN) {
|
|
| 51 |
- fprintf(stderr, " -\u221E"); |
|
| 52 |
- } else {
|
|
| 53 |
- fprintf(stderr, " %.3f", val); |
|
| 54 |
- } |
|
| 55 |
- } |
|
| 56 |
- fprintf(stderr, "\n"); |
|
| 57 |
- } |
|
| 58 |
- fprintf(stderr, "\n\n"); |
|
| 59 |
-} |
|
| 60 |
-#endif |
|
| 61 |
- |
|
| 62 | 35 |
#define MATCH_MAX_LEN 1024 |
| 63 | 36 |
|
| 64 | 37 |
struct match_struct {
|
| ... | ... |
@@ -228,13 +201,6 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 228 | 201 |
last_M = curr_M; |
| 229 | 202 |
} |
| 230 | 203 |
|
| 231 |
-#ifdef DEBUG_VERBOSE |
|
| 232 |
- fprintf(stderr, "\"%s\" =~ \"%s\"\n", needle, haystack); |
|
| 233 |
- mat_print(&D[0][0], 'D', needle, haystack); |
|
| 234 |
- mat_print(&M[0][0], 'M', needle, haystack); |
|
| 235 |
- fprintf(stderr, "\n"); |
|
| 236 |
-#endif |
|
| 237 |
- |
|
| 238 | 204 |
/* backtrace to find the positions of optimal matching */ |
| 239 | 205 |
if (positions) {
|
| 240 | 206 |
int match_required = 0; |
When we're matching without recording positions, we only need to keep
two rows in memory at a time.
| ... | ... |
@@ -28,6 +28,8 @@ int has_match(const char *needle, const char *haystack) {
|
| 28 | 28 |
return 1; |
| 29 | 29 |
} |
| 30 | 30 |
|
| 31 |
+#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
|
| 32 |
+ |
|
| 31 | 33 |
#define max(a, b) (((a) > (b)) ? (a) : (b)) |
| 32 | 34 |
|
| 33 | 35 |
#ifdef DEBUG_VERBOSE |
| ... | ... |
@@ -159,22 +161,24 @@ score_t match(const char *needle, const char *haystack) {
|
| 159 | 161 |
* D[][] Stores the best score for this position ending with a match. |
| 160 | 162 |
* M[][] Stores the best possible score at this position. |
| 161 | 163 |
*/ |
| 162 |
- score_t D[n][m], M[n][m]; |
|
| 164 |
+ score_t D[2][MATCH_MAX_LEN], M[2][MATCH_MAX_LEN]; |
|
| 163 | 165 |
|
| 164 | 166 |
score_t *last_D, *last_M; |
| 165 | 167 |
score_t *curr_D, *curr_M; |
| 166 | 168 |
|
| 167 |
- for (int i = 0; i < n; i++) {
|
|
| 168 |
- curr_D = &D[i][0]; |
|
| 169 |
- curr_M = &M[i][0]; |
|
| 169 |
+ last_D = D[0]; |
|
| 170 |
+ last_M = M[0]; |
|
| 171 |
+ curr_D = D[1]; |
|
| 172 |
+ curr_M = M[1]; |
|
| 170 | 173 |
|
| 174 |
+ for (int i = 0; i < n; i++) {
|
|
| 171 | 175 |
match_row(&match, i, curr_D, curr_M, last_D, last_M); |
| 172 | 176 |
|
| 173 |
- last_D = curr_D; |
|
| 174 |
- last_M = curr_M; |
|
| 177 |
+ SWAP(curr_D, last_D, score_t *); |
|
| 178 |
+ SWAP(curr_M, last_M, score_t *); |
|
| 175 | 179 |
} |
| 176 | 180 |
|
| 177 |
- return M[n - 1][m - 1]; |
|
| 181 |
+ return last_M[m - 1]; |
|
| 178 | 182 |
} |
| 179 | 183 |
|
| 180 | 184 |
score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
| ... | ... |
@@ -4,6 +4,7 @@ |
| 4 | 4 |
#include <stdio.h> |
| 5 | 5 |
#include <float.h> |
| 6 | 6 |
#include <math.h> |
| 7 |
+#include <stdlib.h> |
|
| 7 | 8 |
|
| 8 | 9 |
#include "match.h" |
| 9 | 10 |
#include "bonus.h" |
| ... | ... |
@@ -129,6 +130,53 @@ static inline void match_row(const struct match_struct *match, int row, score_t |
| 129 | 130 |
} |
| 130 | 131 |
} |
| 131 | 132 |
|
| 133 |
+score_t match(const char *needle, const char *haystack) {
|
|
| 134 |
+ if (!*needle) |
|
| 135 |
+ return SCORE_MIN; |
|
| 136 |
+ |
|
| 137 |
+ struct match_struct match; |
|
| 138 |
+ setup_match_struct(&match, needle, haystack); |
|
| 139 |
+ |
|
| 140 |
+ int n = match.needle_len; |
|
| 141 |
+ int m = match.haystack_len; |
|
| 142 |
+ |
|
| 143 |
+ if (m > MATCH_MAX_LEN || n > m) {
|
|
| 144 |
+ /* |
|
| 145 |
+ * Unreasonably large candidate: return no score |
|
| 146 |
+ * If it is a valid match it will still be returned, it will |
|
| 147 |
+ * just be ranked below any reasonably sized candidates |
|
| 148 |
+ */ |
|
| 149 |
+ return SCORE_MIN; |
|
| 150 |
+ } else if (n == m) {
|
|
| 151 |
+ /* Since this method can only be called with a haystack which |
|
| 152 |
+ * matches needle. If the lengths of the strings are equal the |
|
| 153 |
+ * strings themselves must also be equal (ignoring case). |
|
| 154 |
+ */ |
|
| 155 |
+ return SCORE_MAX; |
|
| 156 |
+ } |
|
| 157 |
+ |
|
| 158 |
+ /* |
|
| 159 |
+ * D[][] Stores the best score for this position ending with a match. |
|
| 160 |
+ * M[][] Stores the best possible score at this position. |
|
| 161 |
+ */ |
|
| 162 |
+ score_t D[n][m], M[n][m]; |
|
| 163 |
+ |
|
| 164 |
+ score_t *last_D, *last_M; |
|
| 165 |
+ score_t *curr_D, *curr_M; |
|
| 166 |
+ |
|
| 167 |
+ for (int i = 0; i < n; i++) {
|
|
| 168 |
+ curr_D = &D[i][0]; |
|
| 169 |
+ curr_M = &M[i][0]; |
|
| 170 |
+ |
|
| 171 |
+ match_row(&match, i, curr_D, curr_M, last_D, last_M); |
|
| 172 |
+ |
|
| 173 |
+ last_D = curr_D; |
|
| 174 |
+ last_M = curr_M; |
|
| 175 |
+ } |
|
| 176 |
+ |
|
| 177 |
+ return M[n - 1][m - 1]; |
|
| 178 |
+} |
|
| 179 |
+ |
|
| 132 | 180 |
score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
| 133 | 181 |
if (!*needle) |
| 134 | 182 |
return SCORE_MIN; |
| ... | ... |
@@ -214,7 +262,3 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 214 | 262 |
|
| 215 | 263 |
return M[n - 1][m - 1]; |
| 216 | 264 |
} |
| 217 |
- |
|
| 218 |
-score_t match(const char *needle, const char *haystack) {
|
|
| 219 |
- return match_positions(needle, haystack, NULL); |
|
| 220 |
-} |
| ... | ... |
@@ -96,6 +96,39 @@ static void setup_match_struct(struct match_struct *match, const char *needle, c |
| 96 | 96 |
precompute_bonus(haystack, match->match_bonus); |
| 97 | 97 |
} |
| 98 | 98 |
|
| 99 |
+static inline void match_row(const struct match_struct *match, int row, score_t *curr_D, score_t *curr_M, const score_t *last_D, const score_t *last_M) {
|
|
| 100 |
+ int n = match->needle_len; |
|
| 101 |
+ int m = match->haystack_len; |
|
| 102 |
+ int i = row; |
|
| 103 |
+ |
|
| 104 |
+ const char *lower_needle = match->lower_needle; |
|
| 105 |
+ const char *lower_haystack = match->lower_haystack; |
|
| 106 |
+ const score_t *match_bonus = match->match_bonus; |
|
| 107 |
+ |
|
| 108 |
+ score_t prev_score = SCORE_MIN; |
|
| 109 |
+ score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
|
| 110 |
+ |
|
| 111 |
+ for (int j = 0; j < m; j++) {
|
|
| 112 |
+ if (lower_needle[i] == lower_haystack[j]) {
|
|
| 113 |
+ score_t score = SCORE_MIN; |
|
| 114 |
+ if (!i) {
|
|
| 115 |
+ score = (j * SCORE_GAP_LEADING) + match_bonus[j]; |
|
| 116 |
+ } else if (j) { /* i > 0 && j > 0*/
|
|
| 117 |
+ score = max( |
|
| 118 |
+ last_M[j - 1] + match_bonus[j], |
|
| 119 |
+ |
|
| 120 |
+ /* consecutive match, doesn't stack with match_bonus */ |
|
| 121 |
+ last_D[j - 1] + SCORE_MATCH_CONSECUTIVE); |
|
| 122 |
+ } |
|
| 123 |
+ curr_D[j] = score; |
|
| 124 |
+ curr_M[j] = prev_score = max(score, prev_score + gap_score); |
|
| 125 |
+ } else {
|
|
| 126 |
+ curr_D[j] = SCORE_MIN; |
|
| 127 |
+ curr_M[j] = prev_score = prev_score + gap_score; |
|
| 128 |
+ } |
|
| 129 |
+ } |
|
| 130 |
+} |
|
| 131 |
+ |
|
| 99 | 132 |
score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
| 100 | 133 |
if (!*needle) |
| 101 | 134 |
return SCORE_MIN; |
| ... | ... |
@@ -124,10 +157,6 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 124 | 157 |
return SCORE_MAX; |
| 125 | 158 |
} |
| 126 | 159 |
|
| 127 |
- const char *lower_needle = match.lower_needle; |
|
| 128 |
- const char *lower_haystack = match.lower_haystack; |
|
| 129 |
- const score_t *match_bonus = match.match_bonus; |
|
| 130 |
- |
|
| 131 | 160 |
/* |
| 132 | 161 |
* D[][] Stores the best score for this position ending with a match. |
| 133 | 162 |
* M[][] Stores the best possible score at this position. |
| ... | ... |
@@ -138,31 +167,10 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 138 | 167 |
score_t *curr_D, *curr_M; |
| 139 | 168 |
|
| 140 | 169 |
for (int i = 0; i < n; i++) {
|
| 141 |
- score_t prev_score = SCORE_MIN; |
|
| 142 |
- score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
|
| 143 |
- |
|
| 144 | 170 |
curr_D = &D[i][0]; |
| 145 | 171 |
curr_M = &M[i][0]; |
| 146 | 172 |
|
| 147 |
- for (int j = 0; j < m; j++) {
|
|
| 148 |
- if (lower_needle[i] == lower_haystack[j]) {
|
|
| 149 |
- score_t score = SCORE_MIN; |
|
| 150 |
- if (!i) {
|
|
| 151 |
- score = (j * SCORE_GAP_LEADING) + match_bonus[j]; |
|
| 152 |
- } else if (j) { /* i > 0 && j > 0*/
|
|
| 153 |
- score = max( |
|
| 154 |
- last_M[j - 1] + match_bonus[j], |
|
| 155 |
- |
|
| 156 |
- /* consecutive match, doesn't stack with match_bonus */ |
|
| 157 |
- last_D[j - 1] + SCORE_MATCH_CONSECUTIVE); |
|
| 158 |
- } |
|
| 159 |
- curr_D[j] = score; |
|
| 160 |
- curr_M[j] = prev_score = max(score, prev_score + gap_score); |
|
| 161 |
- } else {
|
|
| 162 |
- curr_D[j] = SCORE_MIN; |
|
| 163 |
- curr_M[j] = prev_score = prev_score + gap_score; |
|
| 164 |
- } |
|
| 165 |
- } |
|
| 173 |
+ match_row(&match, i, curr_D, curr_M, last_D, last_M); |
|
| 166 | 174 |
|
| 167 | 175 |
last_D = curr_D; |
| 168 | 176 |
last_M = curr_M; |
| ... | ... |
@@ -58,6 +58,16 @@ void mat_print(score_t *mat, char name, const char *needle, const char *haystack |
| 58 | 58 |
|
| 59 | 59 |
#define MATCH_MAX_LEN 1024 |
| 60 | 60 |
|
| 61 |
+struct match_struct {
|
|
| 62 |
+ int needle_len; |
|
| 63 |
+ int haystack_len; |
|
| 64 |
+ |
|
| 65 |
+ char lower_needle[MATCH_MAX_LEN]; |
|
| 66 |
+ char lower_haystack[MATCH_MAX_LEN]; |
|
| 67 |
+ |
|
| 68 |
+ score_t match_bonus[MATCH_MAX_LEN]; |
|
| 69 |
+}; |
|
| 70 |
+ |
|
| 61 | 71 |
static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
| 62 | 72 |
/* Which positions are beginning of words */ |
| 63 | 73 |
int m = strlen(haystack); |
| ... | ... |
@@ -69,12 +79,32 @@ static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
| 69 | 79 |
} |
| 70 | 80 |
} |
| 71 | 81 |
|
| 82 |
+static void setup_match_struct(struct match_struct *match, const char *needle, const char *haystack) {
|
|
| 83 |
+ match->needle_len = strlen(needle); |
|
| 84 |
+ match->haystack_len = strlen(haystack); |
|
| 85 |
+ |
|
| 86 |
+ if (match->needle_len > MATCH_MAX_LEN || match->needle_len > match->haystack_len) {
|
|
| 87 |
+ return; |
|
| 88 |
+ } |
|
| 89 |
+ |
|
| 90 |
+ for (int i = 0; i < match->needle_len; i++) |
|
| 91 |
+ match->lower_needle[i] = tolower(needle[i]); |
|
| 92 |
+ |
|
| 93 |
+ for (int i = 0; i < match->haystack_len; i++) |
|
| 94 |
+ match->lower_haystack[i] = tolower(haystack[i]); |
|
| 95 |
+ |
|
| 96 |
+ precompute_bonus(haystack, match->match_bonus); |
|
| 97 |
+} |
|
| 98 |
+ |
|
| 72 | 99 |
score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
| 73 | 100 |
if (!*needle) |
| 74 | 101 |
return SCORE_MIN; |
| 75 | 102 |
|
| 76 |
- int n = strlen(needle); |
|
| 77 |
- int m = strlen(haystack); |
|
| 103 |
+ struct match_struct match; |
|
| 104 |
+ setup_match_struct(&match, needle, haystack); |
|
| 105 |
+ |
|
| 106 |
+ int n = match.needle_len; |
|
| 107 |
+ int m = match.haystack_len; |
|
| 78 | 108 |
|
| 79 | 109 |
if (m > MATCH_MAX_LEN || n > m) {
|
| 80 | 110 |
/* |
| ... | ... |
@@ -94,26 +124,18 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 94 | 124 |
return SCORE_MAX; |
| 95 | 125 |
} |
| 96 | 126 |
|
| 97 |
- char lower_needle[MATCH_MAX_LEN]; |
|
| 98 |
- char lower_haystack[MATCH_MAX_LEN]; |
|
| 99 |
- |
|
| 100 |
- for (int i = 0; i < n; i++) |
|
| 101 |
- lower_needle[i] = tolower(needle[i]); |
|
| 102 |
- |
|
| 103 |
- for (int i = 0; i < m; i++) |
|
| 104 |
- lower_haystack[i] = tolower(haystack[i]); |
|
| 105 |
- |
|
| 106 |
- score_t match_bonus[MATCH_MAX_LEN]; |
|
| 107 |
- score_t D[n][m], M[n][m]; |
|
| 108 |
- |
|
| 109 |
- score_t *last_D, *last_M; |
|
| 110 |
- score_t *curr_D, *curr_M; |
|
| 127 |
+ const char *lower_needle = match.lower_needle; |
|
| 128 |
+ const char *lower_haystack = match.lower_haystack; |
|
| 129 |
+ const score_t *match_bonus = match.match_bonus; |
|
| 111 | 130 |
|
| 112 | 131 |
/* |
| 113 | 132 |
* D[][] Stores the best score for this position ending with a match. |
| 114 | 133 |
* M[][] Stores the best possible score at this position. |
| 115 | 134 |
*/ |
| 116 |
- precompute_bonus(haystack, match_bonus); |
|
| 135 |
+ score_t D[n][m], M[n][m]; |
|
| 136 |
+ |
|
| 137 |
+ score_t *last_D, *last_M; |
|
| 138 |
+ score_t *curr_D, *curr_M; |
|
| 117 | 139 |
|
| 118 | 140 |
for (int i = 0; i < n; i++) {
|
| 119 | 141 |
score_t prev_score = SCORE_MIN; |
This should never happen if this is called properly (match only returns
valid results if there IS a match), but it's nice to double check here.
| ... | ... |
@@ -76,7 +76,7 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 76 | 76 |
int n = strlen(needle); |
| 77 | 77 |
int m = strlen(haystack); |
| 78 | 78 |
|
| 79 |
- if (m > MATCH_MAX_LEN || n > MATCH_MAX_LEN) {
|
|
| 79 |
+ if (m > MATCH_MAX_LEN || n > m) {
|
|
| 80 | 80 |
/* |
| 81 | 81 |
* Unreasonably large candidate: return no score |
| 82 | 82 |
* If it is a valid match it will still be returned, it will |
| ... | ... |
@@ -83,9 +83,7 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 83 | 83 |
* just be ranked below any reasonably sized candidates |
| 84 | 84 |
*/ |
| 85 | 85 |
return SCORE_MIN; |
| 86 |
- } |
|
| 87 |
- |
|
| 88 |
- if (n == m) {
|
|
| 86 |
+ } else if (n == m) {
|
|
| 89 | 87 |
/* Since this method can only be called with a haystack which |
| 90 | 88 |
* matches needle. If the lengths of the strings are equal the |
| 91 | 89 |
* strings themselves must also be equal (ignoring case). |
| ... | ... |
@@ -108,6 +108,9 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 108 | 108 |
score_t match_bonus[MATCH_MAX_LEN]; |
| 109 | 109 |
score_t D[n][m], M[n][m]; |
| 110 | 110 |
|
| 111 |
+ score_t *last_D, *last_M; |
|
| 112 |
+ score_t *curr_D, *curr_M; |
|
| 113 |
+ |
|
| 111 | 114 |
/* |
| 112 | 115 |
* D[][] Stores the best score for this position ending with a match. |
| 113 | 116 |
* M[][] Stores the best possible score at this position. |
| ... | ... |
@@ -118,6 +121,9 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 118 | 121 |
score_t prev_score = SCORE_MIN; |
| 119 | 122 |
score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
| 120 | 123 |
|
| 124 |
+ curr_D = &D[i][0]; |
|
| 125 |
+ curr_M = &M[i][0]; |
|
| 126 |
+ |
|
| 121 | 127 |
for (int j = 0; j < m; j++) {
|
| 122 | 128 |
if (lower_needle[i] == lower_haystack[j]) {
|
| 123 | 129 |
score_t score = SCORE_MIN; |
| ... | ... |
@@ -125,18 +131,21 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 125 | 131 |
score = (j * SCORE_GAP_LEADING) + match_bonus[j]; |
| 126 | 132 |
} else if (j) { /* i > 0 && j > 0*/
|
| 127 | 133 |
score = max( |
| 128 |
- M[i - 1][j - 1] + match_bonus[j], |
|
| 134 |
+ last_M[j - 1] + match_bonus[j], |
|
| 129 | 135 |
|
| 130 | 136 |
/* consecutive match, doesn't stack with match_bonus */ |
| 131 |
- D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE); |
|
| 137 |
+ last_D[j - 1] + SCORE_MATCH_CONSECUTIVE); |
|
| 132 | 138 |
} |
| 133 |
- D[i][j] = score; |
|
| 134 |
- M[i][j] = prev_score = max(score, prev_score + gap_score); |
|
| 139 |
+ curr_D[j] = score; |
|
| 140 |
+ curr_M[j] = prev_score = max(score, prev_score + gap_score); |
|
| 135 | 141 |
} else {
|
| 136 |
- D[i][j] = SCORE_MIN; |
|
| 137 |
- M[i][j] = prev_score = prev_score + gap_score; |
|
| 142 |
+ curr_D[j] = SCORE_MIN; |
|
| 143 |
+ curr_M[j] = prev_score = prev_score + gap_score; |
|
| 138 | 144 |
} |
| 139 | 145 |
} |
| 146 |
+ |
|
| 147 |
+ last_D = curr_D; |
|
| 148 |
+ last_M = curr_M; |
|
| 140 | 149 |
} |
| 141 | 150 |
|
| 142 | 151 |
#ifdef DEBUG_VERBOSE |
| ... | ... |
@@ -105,7 +105,7 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 105 | 105 |
for (int i = 0; i < m; i++) |
| 106 | 106 |
lower_haystack[i] = tolower(haystack[i]); |
| 107 | 107 |
|
| 108 |
- score_t match_bonus[m]; |
|
| 108 |
+ score_t match_bonus[MATCH_MAX_LEN]; |
|
| 109 | 109 |
score_t D[n][m], M[n][m]; |
| 110 | 110 |
|
| 111 | 111 |
/* |
| ... | ... |
@@ -56,6 +56,8 @@ void mat_print(score_t *mat, char name, const char *needle, const char *haystack |
| 56 | 56 |
} |
| 57 | 57 |
#endif |
| 58 | 58 |
|
| 59 |
+#define MATCH_MAX_LEN 1024 |
|
| 60 |
+ |
|
| 59 | 61 |
static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
| 60 | 62 |
/* Which positions are beginning of words */ |
| 61 | 63 |
int m = strlen(haystack); |
| ... | ... |
@@ -74,6 +76,15 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 74 | 76 |
int n = strlen(needle); |
| 75 | 77 |
int m = strlen(haystack); |
| 76 | 78 |
|
| 79 |
+ if (m > MATCH_MAX_LEN || n > MATCH_MAX_LEN) {
|
|
| 80 |
+ /* |
|
| 81 |
+ * Unreasonably large candidate: return no score |
|
| 82 |
+ * If it is a valid match it will still be returned, it will |
|
| 83 |
+ * just be ranked below any reasonably sized candidates |
|
| 84 |
+ */ |
|
| 85 |
+ return SCORE_MIN; |
|
| 86 |
+ } |
|
| 87 |
+ |
|
| 77 | 88 |
if (n == m) {
|
| 78 | 89 |
/* Since this method can only be called with a haystack which |
| 79 | 90 |
* matches needle. If the lengths of the strings are equal the |
| ... | ... |
@@ -85,17 +96,8 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 85 | 96 |
return SCORE_MAX; |
| 86 | 97 |
} |
| 87 | 98 |
|
| 88 |
- if (m > 1024) {
|
|
| 89 |
- /* |
|
| 90 |
- * Unreasonably large candidate: return no score |
|
| 91 |
- * If it is a valid match it will still be returned, it will |
|
| 92 |
- * just be ranked below any reasonably sized candidates |
|
| 93 |
- */ |
|
| 94 |
- return SCORE_MIN; |
|
| 95 |
- } |
|
| 96 |
- |
|
| 97 |
- char lower_needle[n]; |
|
| 98 |
- char lower_haystack[m]; |
|
| 99 |
+ char lower_needle[MATCH_MAX_LEN]; |
|
| 100 |
+ char lower_haystack[MATCH_MAX_LEN]; |
|
| 99 | 101 |
|
| 100 | 102 |
for (int i = 0; i < n; i++) |
| 101 | 103 |
lower_needle[i] = tolower(needle[i]); |
This makes match_positions significantly faster on MacOS by calling
tolower() O(N) times instead of O(N*N)
Before:
$ time ./fzy -e linux --benchmark < linux_files.txt
./fzy -e linux --benchmark < linux_files.txt 13.24s user 0.03s system 381% cpu 3.483 total
After:
$ time ./fzy -e linux --benchmark < linux_files.txt
./fzy -e linux --benchmark < linux_files.txt 4.57s user 0.02s system 381% cpu 1.204 total
| ... | ... |
@@ -94,6 +94,15 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 94 | 94 |
return SCORE_MIN; |
| 95 | 95 |
} |
| 96 | 96 |
|
| 97 |
+ char lower_needle[n]; |
|
| 98 |
+ char lower_haystack[m]; |
|
| 99 |
+ |
|
| 100 |
+ for (int i = 0; i < n; i++) |
|
| 101 |
+ lower_needle[i] = tolower(needle[i]); |
|
| 102 |
+ |
|
| 103 |
+ for (int i = 0; i < m; i++) |
|
| 104 |
+ lower_haystack[i] = tolower(haystack[i]); |
|
| 105 |
+ |
|
| 97 | 106 |
score_t match_bonus[m]; |
| 98 | 107 |
score_t D[n][m], M[n][m]; |
| 99 | 108 |
|
| ... | ... |
@@ -108,7 +117,7 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 108 | 117 |
score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
| 109 | 118 |
|
| 110 | 119 |
for (int j = 0; j < m; j++) {
|
| 111 |
- if (tolower(needle[i]) == tolower(haystack[j])) {
|
|
| 120 |
+ if (lower_needle[i] == lower_haystack[j]) {
|
|
| 112 | 121 |
score_t score = SCORE_MIN; |
| 113 | 122 |
if (!i) {
|
| 114 | 123 |
score = (j * SCORE_GAP_LEADING) + match_bonus[j]; |
| ... | ... |
@@ -6,6 +6,7 @@ |
| 6 | 6 |
#include <math.h> |
| 7 | 7 |
|
| 8 | 8 |
#include "match.h" |
| 9 |
+#include "bonus.h" |
|
| 9 | 10 |
|
| 10 | 11 |
#include "../config.h" |
| 11 | 12 |
|
| ... | ... |
@@ -55,38 +56,13 @@ void mat_print(score_t *mat, char name, const char *needle, const char *haystack |
| 55 | 56 |
} |
| 56 | 57 |
#endif |
| 57 | 58 |
|
| 58 |
-const score_t bonus_states[][256] = {
|
|
| 59 |
- { 0 },
|
|
| 60 |
- {
|
|
| 61 |
- ['/'] = SCORE_MATCH_SLASH, |
|
| 62 |
- ['-'] = SCORE_MATCH_WORD, |
|
| 63 |
- ['_'] = SCORE_MATCH_WORD, |
|
| 64 |
- [' '] = SCORE_MATCH_WORD, |
|
| 65 |
- ['.'] = SCORE_MATCH_DOT, |
|
| 66 |
- }, |
|
| 67 |
- {
|
|
| 68 |
- ['a' ... 'z'] = SCORE_MATCH_CAPITAL, |
|
| 69 |
- ['/'] = SCORE_MATCH_SLASH, |
|
| 70 |
- ['-'] = SCORE_MATCH_WORD, |
|
| 71 |
- ['_'] = SCORE_MATCH_WORD, |
|
| 72 |
- [' '] = SCORE_MATCH_WORD, |
|
| 73 |
- ['.'] = SCORE_MATCH_DOT, |
|
| 74 |
- }, |
|
| 75 |
-}; |
|
| 76 |
- |
|
| 77 |
-const size_t bonus_index[256] = {
|
|
| 78 |
- ['A' ... 'Z'] = 2, |
|
| 79 |
- ['a' ... 'z'] = 1, |
|
| 80 |
- ['0' ... '9'] = 1, |
|
| 81 |
-}; |
|
| 82 |
- |
|
| 83 | 59 |
static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
| 84 | 60 |
/* Which positions are beginning of words */ |
| 85 | 61 |
int m = strlen(haystack); |
| 86 | 62 |
char last_ch = '/'; |
| 87 | 63 |
for (int i = 0; i < m; i++) {
|
| 88 | 64 |
char ch = haystack[i]; |
| 89 |
- match_bonus[i] = bonus_states[bonus_index[(size_t)ch]][(size_t)last_ch]; |
|
| 65 |
+ match_bonus[i] = COMPUTE_BONUS(last_ch, ch); |
|
| 90 | 66 |
last_ch = ch; |
| 91 | 67 |
} |
| 92 | 68 |
} |
| ... | ... |
@@ -55,28 +55,38 @@ void mat_print(score_t *mat, char name, const char *needle, const char *haystack |
| 55 | 55 |
} |
| 56 | 56 |
#endif |
| 57 | 57 |
|
| 58 |
+const score_t bonus_states[][256] = {
|
|
| 59 |
+ { 0 },
|
|
| 60 |
+ {
|
|
| 61 |
+ ['/'] = SCORE_MATCH_SLASH, |
|
| 62 |
+ ['-'] = SCORE_MATCH_WORD, |
|
| 63 |
+ ['_'] = SCORE_MATCH_WORD, |
|
| 64 |
+ [' '] = SCORE_MATCH_WORD, |
|
| 65 |
+ ['.'] = SCORE_MATCH_DOT, |
|
| 66 |
+ }, |
|
| 67 |
+ {
|
|
| 68 |
+ ['a' ... 'z'] = SCORE_MATCH_CAPITAL, |
|
| 69 |
+ ['/'] = SCORE_MATCH_SLASH, |
|
| 70 |
+ ['-'] = SCORE_MATCH_WORD, |
|
| 71 |
+ ['_'] = SCORE_MATCH_WORD, |
|
| 72 |
+ [' '] = SCORE_MATCH_WORD, |
|
| 73 |
+ ['.'] = SCORE_MATCH_DOT, |
|
| 74 |
+ }, |
|
| 75 |
+}; |
|
| 76 |
+ |
|
| 77 |
+const size_t bonus_index[256] = {
|
|
| 78 |
+ ['A' ... 'Z'] = 2, |
|
| 79 |
+ ['a' ... 'z'] = 1, |
|
| 80 |
+ ['0' ... '9'] = 1, |
|
| 81 |
+}; |
|
| 82 |
+ |
|
| 58 | 83 |
static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
| 59 | 84 |
/* Which positions are beginning of words */ |
| 60 | 85 |
int m = strlen(haystack); |
| 61 |
- char last_ch = '\0'; |
|
| 86 |
+ char last_ch = '/'; |
|
| 62 | 87 |
for (int i = 0; i < m; i++) {
|
| 63 | 88 |
char ch = haystack[i]; |
| 64 |
- |
|
| 65 |
- score_t score = 0; |
|
| 66 |
- if (isalnum(ch)) {
|
|
| 67 |
- if (!last_ch || last_ch == '/') {
|
|
| 68 |
- score = SCORE_MATCH_SLASH; |
|
| 69 |
- } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ') {
|
|
| 70 |
- score = SCORE_MATCH_WORD; |
|
| 71 |
- } else if (last_ch >= 'a' && last_ch <= 'z' && ch >= 'A' && ch <= 'Z') {
|
|
| 72 |
- /* CamelCase */ |
|
| 73 |
- score = SCORE_MATCH_CAPITAL; |
|
| 74 |
- } else if (last_ch == '.') {
|
|
| 75 |
- score = SCORE_MATCH_DOT; |
|
| 76 |
- } |
|
| 77 |
- } |
|
| 78 |
- |
|
| 79 |
- match_bonus[i] = score; |
|
| 89 |
+ match_bonus[i] = bonus_states[bonus_index[(size_t)ch]][(size_t)last_ch]; |
|
| 80 | 90 |
last_ch = ch; |
| 81 | 91 |
} |
| 82 | 92 |
} |
This made sense on paper when deciding what was a "word". However in
reality this is rarely an indication of a separate word. I've found that
this caused hexadecimal or base64 strings to be favoured in matches.
| ... | ... |
@@ -66,8 +66,7 @@ static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
| 66 | 66 |
if (isalnum(ch)) {
|
| 67 | 67 |
if (!last_ch || last_ch == '/') {
|
| 68 | 68 |
score = SCORE_MATCH_SLASH; |
| 69 |
- } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ' || |
|
| 70 |
- (last_ch >= '0' && last_ch <= '9')) {
|
|
| 69 |
+ } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ') {
|
|
| 71 | 70 |
score = SCORE_MATCH_WORD; |
| 72 | 71 |
} else if (last_ch >= 'a' && last_ch <= 'z' && ch >= 'A' && ch <= 'Z') {
|
| 73 | 72 |
/* CamelCase */ |
| ... | ... |
@@ -55,6 +55,33 @@ void mat_print(score_t *mat, char name, const char *needle, const char *haystack |
| 55 | 55 |
} |
| 56 | 56 |
#endif |
| 57 | 57 |
|
| 58 |
+static void precompute_bonus(const char *haystack, score_t *match_bonus) {
|
|
| 59 |
+ /* Which positions are beginning of words */ |
|
| 60 |
+ int m = strlen(haystack); |
|
| 61 |
+ char last_ch = '\0'; |
|
| 62 |
+ for (int i = 0; i < m; i++) {
|
|
| 63 |
+ char ch = haystack[i]; |
|
| 64 |
+ |
|
| 65 |
+ score_t score = 0; |
|
| 66 |
+ if (isalnum(ch)) {
|
|
| 67 |
+ if (!last_ch || last_ch == '/') {
|
|
| 68 |
+ score = SCORE_MATCH_SLASH; |
|
| 69 |
+ } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ' || |
|
| 70 |
+ (last_ch >= '0' && last_ch <= '9')) {
|
|
| 71 |
+ score = SCORE_MATCH_WORD; |
|
| 72 |
+ } else if (last_ch >= 'a' && last_ch <= 'z' && ch >= 'A' && ch <= 'Z') {
|
|
| 73 |
+ /* CamelCase */ |
|
| 74 |
+ score = SCORE_MATCH_CAPITAL; |
|
| 75 |
+ } else if (last_ch == '.') {
|
|
| 76 |
+ score = SCORE_MATCH_DOT; |
|
| 77 |
+ } |
|
| 78 |
+ } |
|
| 79 |
+ |
|
| 80 |
+ match_bonus[i] = score; |
|
| 81 |
+ last_ch = ch; |
|
| 82 |
+ } |
|
| 83 |
+} |
|
| 84 |
+ |
|
| 58 | 85 |
score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
| 59 | 86 |
if (!*needle) |
| 60 | 87 |
return SCORE_MIN; |
| ... | ... |
@@ -89,30 +116,7 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 89 | 116 |
* D[][] Stores the best score for this position ending with a match. |
| 90 | 117 |
* M[][] Stores the best possible score at this position. |
| 91 | 118 |
*/ |
| 92 |
- |
|
| 93 |
- /* Which positions are beginning of words */ |
|
| 94 |
- char last_ch = '\0'; |
|
| 95 |
- for (int i = 0; i < m; i++) {
|
|
| 96 |
- char ch = haystack[i]; |
|
| 97 |
- |
|
| 98 |
- score_t score = 0; |
|
| 99 |
- if (isalnum(ch)) {
|
|
| 100 |
- if (!last_ch || last_ch == '/') {
|
|
| 101 |
- score = SCORE_MATCH_SLASH; |
|
| 102 |
- } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ' || |
|
| 103 |
- (last_ch >= '0' && last_ch <= '9')) {
|
|
| 104 |
- score = SCORE_MATCH_WORD; |
|
| 105 |
- } else if (last_ch >= 'a' && last_ch <= 'z' && ch >= 'A' && ch <= 'Z') {
|
|
| 106 |
- /* CamelCase */ |
|
| 107 |
- score = SCORE_MATCH_CAPITAL; |
|
| 108 |
- } else if (last_ch == '.') {
|
|
| 109 |
- score = SCORE_MATCH_DOT; |
|
| 110 |
- } |
|
| 111 |
- } |
|
| 112 |
- |
|
| 113 |
- match_bonus[i] = score; |
|
| 114 |
- last_ch = ch; |
|
| 115 |
- } |
|
| 119 |
+ precompute_bonus(haystack, match_bonus); |
|
| 116 | 120 |
|
| 117 | 121 |
for (int i = 0; i < n; i++) {
|
| 118 | 122 |
score_t prev_score = SCORE_MIN; |
This is marginally faster and I still think it reads very well (maybe
better).
| ... | ... |
@@ -117,9 +117,10 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 117 | 117 |
for (int i = 0; i < n; i++) {
|
| 118 | 118 |
score_t prev_score = SCORE_MIN; |
| 119 | 119 |
score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
| 120 |
+ |
|
| 120 | 121 |
for (int j = 0; j < m; j++) {
|
| 121 |
- score_t score = SCORE_MIN; |
|
| 122 | 122 |
if (tolower(needle[i]) == tolower(haystack[j])) {
|
| 123 |
+ score_t score = SCORE_MIN; |
|
| 123 | 124 |
if (!i) {
|
| 124 | 125 |
score = (j * SCORE_GAP_LEADING) + match_bonus[j]; |
| 125 | 126 |
} else if (j) { /* i > 0 && j > 0*/
|
| ... | ... |
@@ -129,9 +130,12 @@ score_t match_positions(const char *needle, const char *haystack, size_t *positi |
| 129 | 130 |
/* consecutive match, doesn't stack with match_bonus */ |
| 130 | 131 |
D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE); |
| 131 | 132 |
} |
| 133 |
+ D[i][j] = score; |
|
| 134 |
+ M[i][j] = prev_score = max(score, prev_score + gap_score); |
|
| 135 |
+ } else {
|
|
| 136 |
+ D[i][j] = SCORE_MIN; |
|
| 137 |
+ M[i][j] = prev_score = prev_score + gap_score; |
|
| 132 | 138 |
} |
| 133 |
- D[i][j] = score; |
|
| 134 |
- M[i][j] = prev_score = max(score, prev_score + gap_score); |
|
| 135 | 139 |
} |
| 136 | 140 |
} |
| 137 | 141 |
|
| ... | ... |
@@ -55,7 +55,7 @@ void mat_print(score_t *mat, char name, const char *needle, const char *haystack |
| 55 | 55 |
} |
| 56 | 56 |
#endif |
| 57 | 57 |
|
| 58 |
-score_t calculate_score(const char *needle, const char *haystack, size_t *positions) {
|
|
| 58 |
+score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
|
| 59 | 59 |
if (!*needle) |
| 60 | 60 |
return SCORE_MIN; |
| 61 | 61 |
|
| ... | ... |
@@ -174,10 +174,6 @@ score_t calculate_score(const char *needle, const char *haystack, size_t *positi |
| 174 | 174 |
return M[n - 1][m - 1]; |
| 175 | 175 |
} |
| 176 | 176 |
|
| 177 |
-score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
|
| 178 |
- return calculate_score(needle, haystack, positions); |
|
| 179 |
-} |
|
| 180 |
- |
|
| 181 | 177 |
score_t match(const char *needle, const char *haystack) {
|
| 182 | 178 |
return match_positions(needle, haystack, NULL); |
| 183 | 179 |
} |
| ... | ... |
@@ -56,12 +56,23 @@ void mat_print(score_t *mat, char name, const char *needle, const char *haystack |
| 56 | 56 |
#endif |
| 57 | 57 |
|
| 58 | 58 |
score_t calculate_score(const char *needle, const char *haystack, size_t *positions) {
|
| 59 |
- if (!*haystack || !*needle) |
|
| 59 |
+ if (!*needle) |
|
| 60 | 60 |
return SCORE_MIN; |
| 61 | 61 |
|
| 62 | 62 |
int n = strlen(needle); |
| 63 | 63 |
int m = strlen(haystack); |
| 64 | 64 |
|
| 65 |
+ if (n == m) {
|
|
| 66 |
+ /* Since this method can only be called with a haystack which |
|
| 67 |
+ * matches needle. If the lengths of the strings are equal the |
|
| 68 |
+ * strings themselves must also be equal (ignoring case). |
|
| 69 |
+ */ |
|
| 70 |
+ if (positions) |
|
| 71 |
+ for (int i = 0; i < n; i++) |
|
| 72 |
+ positions[i] = i; |
|
| 73 |
+ return SCORE_MAX; |
|
| 74 |
+ } |
|
| 75 |
+ |
|
| 65 | 76 |
if (m > 1024) {
|
| 66 | 77 |
/* |
| 67 | 78 |
* Unreasonably large candidate: return no score |
| ... | ... |
@@ -164,18 +175,7 @@ score_t calculate_score(const char *needle, const char *haystack, size_t *positi |
| 164 | 175 |
} |
| 165 | 176 |
|
| 166 | 177 |
score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
| 167 |
- if (!*needle) {
|
|
| 168 |
- return SCORE_MAX; |
|
| 169 |
- } else if (!strcasecmp(needle, haystack)) {
|
|
| 170 |
- if (positions) {
|
|
| 171 |
- int n = strlen(needle); |
|
| 172 |
- for (int i = 0; i < n; i++) |
|
| 173 |
- positions[i] = i; |
|
| 174 |
- } |
|
| 175 |
- return SCORE_MAX; |
|
| 176 |
- } else {
|
|
| 177 |
- return calculate_score(needle, haystack, positions); |
|
| 178 |
- } |
|
| 178 |
+ return calculate_score(needle, haystack, positions); |
|
| 179 | 179 |
} |
| 180 | 180 |
|
| 181 | 181 |
score_t match(const char *needle, const char *haystack) {
|
| 1 | 1 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,183 @@ |
| 1 |
+#include <ctype.h> |
|
| 2 |
+#include <string.h> |
|
| 3 |
+#include <strings.h> |
|
| 4 |
+#include <stdio.h> |
|
| 5 |
+#include <float.h> |
|
| 6 |
+#include <math.h> |
|
| 7 |
+ |
|
| 8 |
+#include "match.h" |
|
| 9 |
+ |
|
| 10 |
+#include "../config.h" |
|
| 11 |
+ |
|
| 12 |
+char *strcasechr(const char *s, char c) {
|
|
| 13 |
+ const char accept[3] = {c, toupper(c), 0};
|
|
| 14 |
+ return strpbrk(s, accept); |
|
| 15 |
+} |
|
| 16 |
+ |
|
| 17 |
+int has_match(const char *needle, const char *haystack) {
|
|
| 18 |
+ while (*needle) {
|
|
| 19 |
+ char nch = *needle++; |
|
| 20 |
+ |
|
| 21 |
+ if (!(haystack = strcasechr(haystack, nch))) {
|
|
| 22 |
+ return 0; |
|
| 23 |
+ } |
|
| 24 |
+ haystack++; |
|
| 25 |
+ } |
|
| 26 |
+ return 1; |
|
| 27 |
+} |
|
| 28 |
+ |
|
| 29 |
+#define max(a, b) (((a) > (b)) ? (a) : (b)) |
|
| 30 |
+ |
|
| 31 |
+#ifdef DEBUG_VERBOSE |
|
| 32 |
+/* print one of the internal matrices */ |
|
| 33 |
+void mat_print(score_t *mat, char name, const char *needle, const char *haystack) {
|
|
| 34 |
+ int n = strlen(needle); |
|
| 35 |
+ int m = strlen(haystack); |
|
| 36 |
+ int i, j; |
|
| 37 |
+ fprintf(stderr, "%c ", name); |
|
| 38 |
+ for (j = 0; j < m; j++) {
|
|
| 39 |
+ fprintf(stderr, " %c", haystack[j]); |
|
| 40 |
+ } |
|
| 41 |
+ fprintf(stderr, "\n"); |
|
| 42 |
+ for (i = 0; i < n; i++) {
|
|
| 43 |
+ fprintf(stderr, " %c |", needle[i]); |
|
| 44 |
+ for (j = 0; j < m; j++) {
|
|
| 45 |
+ score_t val = mat[i * m + j]; |
|
| 46 |
+ if (val == SCORE_MIN) {
|
|
| 47 |
+ fprintf(stderr, " -\u221E"); |
|
| 48 |
+ } else {
|
|
| 49 |
+ fprintf(stderr, " %.3f", val); |
|
| 50 |
+ } |
|
| 51 |
+ } |
|
| 52 |
+ fprintf(stderr, "\n"); |
|
| 53 |
+ } |
|
| 54 |
+ fprintf(stderr, "\n\n"); |
|
| 55 |
+} |
|
| 56 |
+#endif |
|
| 57 |
+ |
|
| 58 |
+score_t calculate_score(const char *needle, const char *haystack, size_t *positions) {
|
|
| 59 |
+ if (!*haystack || !*needle) |
|
| 60 |
+ return SCORE_MIN; |
|
| 61 |
+ |
|
| 62 |
+ int n = strlen(needle); |
|
| 63 |
+ int m = strlen(haystack); |
|
| 64 |
+ |
|
| 65 |
+ if (m > 1024) {
|
|
| 66 |
+ /* |
|
| 67 |
+ * Unreasonably large candidate: return no score |
|
| 68 |
+ * If it is a valid match it will still be returned, it will |
|
| 69 |
+ * just be ranked below any reasonably sized candidates |
|
| 70 |
+ */ |
|
| 71 |
+ return SCORE_MIN; |
|
| 72 |
+ } |
|
| 73 |
+ |
|
| 74 |
+ score_t match_bonus[m]; |
|
| 75 |
+ score_t D[n][m], M[n][m]; |
|
| 76 |
+ |
|
| 77 |
+ /* |
|
| 78 |
+ * D[][] Stores the best score for this position ending with a match. |
|
| 79 |
+ * M[][] Stores the best possible score at this position. |
|
| 80 |
+ */ |
|
| 81 |
+ |
|
| 82 |
+ /* Which positions are beginning of words */ |
|
| 83 |
+ char last_ch = '\0'; |
|
| 84 |
+ for (int i = 0; i < m; i++) {
|
|
| 85 |
+ char ch = haystack[i]; |
|
| 86 |
+ |
|
| 87 |
+ score_t score = 0; |
|
| 88 |
+ if (isalnum(ch)) {
|
|
| 89 |
+ if (!last_ch || last_ch == '/') {
|
|
| 90 |
+ score = SCORE_MATCH_SLASH; |
|
| 91 |
+ } else if (last_ch == '-' || last_ch == '_' || last_ch == ' ' || |
|
| 92 |
+ (last_ch >= '0' && last_ch <= '9')) {
|
|
| 93 |
+ score = SCORE_MATCH_WORD; |
|
| 94 |
+ } else if (last_ch >= 'a' && last_ch <= 'z' && ch >= 'A' && ch <= 'Z') {
|
|
| 95 |
+ /* CamelCase */ |
|
| 96 |
+ score = SCORE_MATCH_CAPITAL; |
|
| 97 |
+ } else if (last_ch == '.') {
|
|
| 98 |
+ score = SCORE_MATCH_DOT; |
|
| 99 |
+ } |
|
| 100 |
+ } |
|
| 101 |
+ |
|
| 102 |
+ match_bonus[i] = score; |
|
| 103 |
+ last_ch = ch; |
|
| 104 |
+ } |
|
| 105 |
+ |
|
| 106 |
+ for (int i = 0; i < n; i++) {
|
|
| 107 |
+ score_t prev_score = SCORE_MIN; |
|
| 108 |
+ score_t gap_score = i == n - 1 ? SCORE_GAP_TRAILING : SCORE_GAP_INNER; |
|
| 109 |
+ for (int j = 0; j < m; j++) {
|
|
| 110 |
+ score_t score = SCORE_MIN; |
|
| 111 |
+ if (tolower(needle[i]) == tolower(haystack[j])) {
|
|
| 112 |
+ if (!i) {
|
|
| 113 |
+ score = (j * SCORE_GAP_LEADING) + match_bonus[j]; |
|
| 114 |
+ } else if (j) { /* i > 0 && j > 0*/
|
|
| 115 |
+ score = max( |
|
| 116 |
+ M[i - 1][j - 1] + match_bonus[j], |
|
| 117 |
+ |
|
| 118 |
+ /* consecutive match, doesn't stack with match_bonus */ |
|
| 119 |
+ D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE); |
|
| 120 |
+ } |
|
| 121 |
+ } |
|
| 122 |
+ D[i][j] = score; |
|
| 123 |
+ M[i][j] = prev_score = max(score, prev_score + gap_score); |
|
| 124 |
+ } |
|
| 125 |
+ } |
|
| 126 |
+ |
|
| 127 |
+#ifdef DEBUG_VERBOSE |
|
| 128 |
+ fprintf(stderr, "\"%s\" =~ \"%s\"\n", needle, haystack); |
|
| 129 |
+ mat_print(&D[0][0], 'D', needle, haystack); |
|
| 130 |
+ mat_print(&M[0][0], 'M', needle, haystack); |
|
| 131 |
+ fprintf(stderr, "\n"); |
|
| 132 |
+#endif |
|
| 133 |
+ |
|
| 134 |
+ /* backtrace to find the positions of optimal matching */ |
|
| 135 |
+ if (positions) {
|
|
| 136 |
+ int match_required = 0; |
|
| 137 |
+ for (int i = n - 1, j = m - 1; i >= 0; i--) {
|
|
| 138 |
+ for (; j >= 0; j--) {
|
|
| 139 |
+ /* |
|
| 140 |
+ * There may be multiple paths which result in |
|
| 141 |
+ * the optimal weight. |
|
| 142 |
+ * |
|
| 143 |
+ * For simplicity, we will pick the first one |
|
| 144 |
+ * we encounter, the latest in the candidate |
|
| 145 |
+ * string. |
|
| 146 |
+ */ |
|
| 147 |
+ if (D[i][j] != SCORE_MIN && |
|
| 148 |
+ (match_required || D[i][j] == M[i][j])) {
|
|
| 149 |
+ /* If this score was determined using |
|
| 150 |
+ * SCORE_MATCH_CONSECUTIVE, the |
|
| 151 |
+ * previous character MUST be a match |
|
| 152 |
+ */ |
|
| 153 |
+ match_required = |
|
| 154 |
+ i && j && |
|
| 155 |
+ M[i][j] == D[i - 1][j - 1] + SCORE_MATCH_CONSECUTIVE; |
|
| 156 |
+ positions[i] = j--; |
|
| 157 |
+ break; |
|
| 158 |
+ } |
|
| 159 |
+ } |
|
| 160 |
+ } |
|
| 161 |
+ } |
|
| 162 |
+ |
|
| 163 |
+ return M[n - 1][m - 1]; |
|
| 164 |
+} |
|
| 165 |
+ |
|
| 166 |
+score_t match_positions(const char *needle, const char *haystack, size_t *positions) {
|
|
| 167 |
+ if (!*needle) {
|
|
| 168 |
+ return SCORE_MAX; |
|
| 169 |
+ } else if (!strcasecmp(needle, haystack)) {
|
|
| 170 |
+ if (positions) {
|
|
| 171 |
+ int n = strlen(needle); |
|
| 172 |
+ for (int i = 0; i < n; i++) |
|
| 173 |
+ positions[i] = i; |
|
| 174 |
+ } |
|
| 175 |
+ return SCORE_MAX; |
|
| 176 |
+ } else {
|
|
| 177 |
+ return calculate_score(needle, haystack, positions); |
|
| 178 |
+ } |
|
| 179 |
+} |
|
| 180 |
+ |
|
| 181 |
+score_t match(const char *needle, const char *haystack) {
|
|
| 182 |
+ return match_positions(needle, haystack, NULL); |
|
| 183 |
+} |