Skip to content

Commit 1bd5d7b

Browse files
author
tmikolov
committed
demo code for computing analogies with the word vectors
1 parent efc5e10 commit 1bd5d7b

1 file changed

Lines changed: 138 additions & 0 deletions

File tree

word-analogy.c

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// Copyright 2013 Google Inc. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
19+
20+
// Capacity, in bytes, of the query line buffer; also the number of floats in
// the query vector buffer, so it bounds the vector dimensionality as well.
const long long max_size = 2000;
// Number of closest words that are printed for each query.
const long long N = 40;
// Bytes reserved for each vocabulary entry, including the terminating NUL.
const long long max_w = 50;
23+
24+
int main(int argc, char **argv) {
25+
FILE *f;
26+
char st1[max_size];
27+
char bestw[N][max_size];
28+
char file_name[max_size], st[100][max_size];
29+
float dist, len, bestd[N], vec[max_size];
30+
long long words, size, a, b, c, d, cn, bi[100];
31+
char ch;
32+
float *M;
33+
char *vocab;
34+
if (argc < 2) {
35+
printf("Usage: ./word-analogy <FILE>\nwhere FILE contains word projections in the BINARY FORMAT\n");
36+
return 0;
37+
}
38+
strcpy(file_name, argv[1]);
39+
f = fopen(file_name, "rb");
40+
if (f == NULL) {
41+
printf("Input file not found\n");
42+
return -1;
43+
}
44+
fscanf(f, "%lld", &words);
45+
fscanf(f, "%lld", &size);
46+
vocab = (char *)malloc((long long)words * max_w * sizeof(char));
47+
M = (float *)malloc((long long)words * (long long)size * sizeof(float));
48+
if (M == NULL) {
49+
printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
50+
return -1;
51+
}
52+
for (b = 0; b < words; b++) {
53+
fscanf(f, "%s%c", &vocab[b * max_w], &ch);
54+
for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f);
55+
len = 0;
56+
for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
57+
len = sqrt(len);
58+
for (a = 0; a < size; a++) M[a + b * size] /= len;
59+
}
60+
fclose(f);
61+
while (1) {
62+
for (a = 0; a < N; a++) bestd[a] = 0;
63+
for (a = 0; a < N; a++) bestw[a][0] = 0;
64+
printf("Enter three words (EXIT to break): ");
65+
a = 0;
66+
while (1) {
67+
st1[a] = fgetc(stdin);
68+
if ((st1[a] == '\n') || (a >= max_size - 1)) {
69+
st1[a] = 0;
70+
break;
71+
}
72+
a++;
73+
}
74+
if (!strcmp(st1, "EXIT")) break;
75+
cn = 0;
76+
b = 0;
77+
c = 0;
78+
while (1) {
79+
st[cn][b] = st1[c];
80+
b++;
81+
c++;
82+
st[cn][b] = 0;
83+
if (st1[c] == 0) break;
84+
if (st1[c] == ' ') {
85+
cn++;
86+
b = 0;
87+
c++;
88+
}
89+
}
90+
cn++;
91+
if (cn < 3) {
92+
printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
93+
continue;
94+
}
95+
for (a = 0; a < cn; a++) {
96+
for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
97+
if (b == words) b = 0;
98+
bi[a] = b;
99+
printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
100+
if (b == 0) {
101+
printf("Out of dictionary word!\n");
102+
break;
103+
}
104+
}
105+
if (b == 0) continue;
106+
printf("\n Word Distance\n------------------------------------------------------------------------\n");
107+
for (a = 0; a < size; a++) vec[a] = M[a + bi[1] * size] - M[a + bi[0] * size] + M[a + bi[2] * size];
108+
len = 0;
109+
for (a = 0; a < size; a++) len += vec[a] * vec[a];
110+
len = sqrt(len);
111+
for (a = 0; a < size; a++) vec[a] /= len;
112+
for (a = 0; a < N; a++) bestd[a] = 0;
113+
for (a = 0; a < N; a++) bestw[a][0] = 0;
114+
for (c = 0; c < words; c++) {
115+
if (c == bi[0]) continue;
116+
if (c == bi[1]) continue;
117+
if (c == bi[2]) continue;
118+
a = 0;
119+
for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
120+
if (a == 1) continue;
121+
dist = 0;
122+
for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
123+
for (a = 0; a < N; a++) {
124+
if (dist > bestd[a]) {
125+
for (d = N - 1; d > a; d--) {
126+
bestd[d] = bestd[d - 1];
127+
strcpy(bestw[d], bestw[d - 1]);
128+
}
129+
bestd[a] = dist;
130+
strcpy(bestw[a], &vocab[c * max_w]);
131+
break;
132+
}
133+
}
134+
}
135+
for (a = 0; a < N; a++) printf("%50s\t\t%f\n", bestw[a], bestd[a]);
136+
}
137+
return 0;
138+
}

0 commit comments

Comments
 (0)