Browse code

Improve scoring

John Hawthorn authored on 30/07/2014 04:28:32
Showing 2 changed files

... ...
@@ -35,6 +35,15 @@ int test_scoring(){
35 35
 	/* App/MOdels/foo is better than App/M/fOo (capitals mark the matched letters) */
36 36
 	assert(match("amo", "app/m/foo") < match("amo", "app/models/foo"));
37 37
 
38
+	/* GEMFIle.Lock < GEMFILe  */
39
+	assert(match("gemfil", "Gemfile.lock") < match("gemfil", "Gemfile"));
40
+
41
+	/* GEMFIle.Lock < GEMFILe  */
42
+	assert(match("gemfil", "Gemfile.lock") < match("gemfil", "Gemfile"));
43
+
44
+	/* Prefer shorter matches */
45
+	assert(match("test", "tests") > match("test", "testing"));
46
+
38 47
 	return 0;
39 48
 }
40 49
 
... ...
@@ -3,6 +3,7 @@
3 3
 #include <strings.h>
4 4
 #include <stdio.h>
5 5
 #include <float.h>
6
+#include <math.h>
6 7
 
7 8
 #include "fzy.h"
8 9
 
... ...
@@ -18,8 +19,8 @@ int has_match(const char *needle, const char *haystack){
18 19
 
19 20
 #define max(a, b) (((a) > (b)) ? (a) : (b))
20 21
 typedef double score_t;
21
-#define SCORE_MAX DBL_MAX
22
-#define SCORE_MIN -DBL_MAX
22
+#define SCORE_MAX INFINITY
23
+#define SCORE_MIN -INFINITY
23 24
 
24 25
 /* print one of the internal matrices */
25 26
 void mat_print(score_t *mat, int n, int m){
... ...
@@ -65,34 +66,52 @@ double calculate_score(const char *needle, const char *haystack, size_t *positio
65 66
 	 */
66 67
 
67 68
 	/* Which positions are the beginning of a word */
68
-	int at_bow = 1;
69 69
 	char last_ch = '\0';
70 70
 	for(int i = 0; i < m; i++){
71 71
 		char ch = haystack[i];
72
-		/* TODO: What about allcaps (ex. README) */
73
-		int bow = (at_bow && isalnum(ch)) || (isupper(ch) && !isupper(last_ch));
74
-		at_bow = !isalnum(ch);
75
-		last_ch = ch;
76 72
 
77
-		match_bonus[i] = bow ? 1.5 : 0;
73
+		score_t score = 0;
74
+		if(isalnum(ch)){
75
+			if(last_ch == '/'){
76
+				score = 1.5;
77
+			}else if(last_ch == '-' ||
78
+					last_ch == '_' ||
79
+					last_ch == ' ' ||
80
+					(last_ch >= '0' && last_ch <= '9')){
81
+				score = 1.2;
82
+			}else if(last_ch >= 'a' && last_ch <= 'z' &&
83
+					ch >= 'A' && ch <= 'Z'){
84
+				/* CamelCase */
85
+				score = 1.1;
86
+			}else if(last_ch == '.'){
87
+				score = 0.8;
88
+			}
89
+		}
90
+
91
+		match_bonus[i] = score;
92
+		last_ch = ch;
78 93
 	}
79 94
 
80 95
 	for(int i = 0; i < n; i++){
81 96
 		for(int j = 0; j < m; j++){
82
-			D[i][j] = SCORE_MIN;
97
+			score_t score = j ? SCORE_MIN : 0;
83 98
 			int match = tolower(needle[i]) == tolower(haystack[j]);
99
+			D[i][j] = SCORE_MIN;
84 100
 			if(match){
85
-				score_t score = 0;
86 101
 				if(i && j){
87
-					score = M[i-1][j-1] + match_bonus[j];
102
+					score = max(score, M[i-1][j-1] + match_bonus[j]);
88 103
 
89 104
 					/* consecutive match, doesn't stack with match_bonus */
90
-					score = max(score, 1 + D[i-1][j-1]);
105
+					score = max(score, D[i-1][j-1] + 1.0);
106
+				}else if(!i){
107
+					score = (j * -0.01) + match_bonus[j];
91 108
 				}
92
-				M[i][j] = D[i][j] = score;
109
+				D[i][j] = score;
110
+			}
111
+			if(j){
112
+				score = max(score, M[i][j-1] - 0.01);
93 113
 			}
94
-			if(j)
95
-				M[i][j] = max(M[i][j], M[i][j-1] - 0.05);
114
+			M[i][j] = score;
96 115
 		}
97 116
 	}
98 117
 
... ...
@@ -113,8 +132,8 @@ double calculate_score(const char *needle, const char *haystack, size_t *positio
113 132
 				 * we encounter, the latest in the candidate
114 133
 				 * string.
115 134
 				 */
116
-				if(D[i][j] == M[i][j]){
117
-					positions[i] = j;
135
+				if(tolower(needle[i]) == tolower(haystack[j]) && D[i][j] == M[i][j]){
136
+					positions[i] = j--;
118 137
 					break;
119 138
 				}
120 139
 			}