Browse code

New DP match scoring algorithm

John Hawthorn authored on 13/07/2014 00:45:07
Showing 3 changed files

  • Makefile index 974332c..6253311 100644
  • match.c index 8ef15a1..967607b 100644
  • test.rb index e1e9f0b..849bde2 100644
... ...
@@ -1,4 +1,4 @@
1
-CFLAGS+=-Wall -Wextra -g
1
+CFLAGS+=-Wall -Wextra -g -std=c99
2 2
 
3 3
 all: fzy testscore
4 4
 
... ...
@@ -1,5 +1,7 @@
1 1
 #include <ctype.h>
2 2
 #include <string.h>
3
+#include <strings.h>
4
+#include <stdio.h>
3 5
 
4 6
 static int is_subset(const char *needle, const char *haystack){
5 7
 	while(*needle){
... ...
@@ -11,14 +13,73 @@ static int is_subset(const char *needle, const char *haystack){
11 13
 	return 1;
12 14
 }
13 15
 
16
+/* print one of the internal matrices */
17
+void mat_print(int *mat, int n, int m){
18
+	int i, j;
19
+	for(i = 0; i < n; i++){
20
+		for(j = 0; j < m; j++){
21
+			fprintf(stderr, " %3zd", mat[i*m + j]);
22
+		}
23
+		fprintf(stderr, "\n");
24
+	}
25
+	fprintf(stderr, "\n\n");
26
+}
27
+
28
+#define max(a, b) (((a) > (b)) ? (a) : (b))
29
+typedef int score_t;
30
+
31
+double calculate_score(const char *needle, const char *haystack){
32
+	int n = strlen(needle);
33
+	int m = strlen(haystack);
34
+
35
+	int bow[m];
36
+	int D[n][m], M[n][m];
37
+	bzero(D, sizeof(D));
38
+	bzero(M, sizeof(M));
39
+
40
+	/*
41
+	 * D[][] Stores the best score for this position ending with a match.
42
+	 * M[][] Stores the best possible score at this position.
43
+	 */
44
+
45
+	/* Which positions are beginning of words */
46
+	int at_bow = 1;
47
+	for(int i = 0; i < m; i++){
48
+		char ch = haystack[i];
49
+		/* TODO: What about allcaps (ex. README) */
50
+		bow[i] = (at_bow && isalnum(ch)) || isupper(ch);
51
+		at_bow = !isalnum(ch);
52
+	}
53
+
54
+	for(int i = 0; i < n; i++){
55
+		for(int j = 0; j < m; j++){
56
+			int match = tolower(needle[i]) == tolower(haystack[j]);
57
+			if(match){
58
+				score_t score = 0;
59
+				if(i && j)
60
+					score = M[i-1][j-1];
61
+				if(bow[j])
62
+					score += 2;
63
+				else if(i && j && D[i-1][j-1])
64
+					score = max(score, 1 + D[i-1][j-1]);
65
+				M[i][j] = D[i][j] = score;
66
+			}
67
+			if(j)
68
+				M[i][j] = max(M[i][j], M[i][j-1]);
69
+		}
70
+	}
71
+
72
+	return (float)(M[n-1][m-1]) / (float)(n * 2 + 1);
73
+}
74
+
14 75
 double match(const char *needle, const char *haystack){
15 76
 	if(!*needle){
16 77
 		return 1.0;
17 78
 	}else if(!is_subset(needle, haystack)){
18
-		return 0.0;
79
+		return -1.0;
19 80
 	}else if(!strcasecmp(needle, haystack)){
20 81
 		return 1.0;
21 82
 	}else{
22
-		return 0.9;
83
+		return calculate_score(needle, haystack);
23 84
 	}
24 85
 }
... ...
@@ -8,11 +8,11 @@ describe "score" do
8 8
   end
9 9
 
10 10
   def assert_unmatched(candidate, query)
11
-    assert_equal 0, score(candidate, query)
11
+    assert_equal -1, score(candidate, query)
12 12
   end
13 13
 
14 14
   def assert_matched(candidate, query)
15
-    assert_operator 0, :<, score(candidate, query)
15
+    assert_operator 0, :<=, score(candidate, query)
16 16
   end
17 17
 
18 18
   it "scores 1 when the query is empty" do