Annotated word2vec core code

Date: 2022-09-27 01:44:23

These notes are best read side by side with word2vec.c; the highlighted passages are the added annotations and the code they refer to. Added by lijiawei.

//  Copyright 2013 Google Inc. All Rights Reserved.

//

//  Licensed under the Apache License, Version 2.0 (the "License");

//  you may not use this file except in compliance with the License.

//  You may obtain a copy of the License at

//

//      http://www.apache.org/licenses/LICENSE-2.0

//

//  Unless required by applicable law or agreed to in writing, software

//  distributed under the License is distributed on an "AS IS" BASIS,

//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

//  See the License for the specific language governing permissions and

//  limitations under the License.

 

#define MAX_STRING 100

#define EXP_TABLE_SIZE 1000

#define MAX_EXP 6

#define MAX_SENTENCE_LENGTH 1000

#define MAX_CODE_LENGTH 40

 

const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary

 

typedef float real;                    // Precision of float numbers

 

//@brief Structure for each word in the vocabulary built from the input file
//cn: number of times the word occurs
//point: the path in the Huffman tree from the root down to this word, stored as the vocabulary indices of the inner (parent) nodes along the path
//word: the word string itself
//code: the word's Huffman code
//codelen: the length of that Huffman code

struct vocab_word {

long long cn;

int *point;

char *word, *code, codelen;

};

 

char train_file[MAX_STRING], output_file[MAX_STRING];

char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];

//@brief Array of vocab_word structures for the words in the input file; this is the vocabulary of the paper

struct vocab_word *vocab;

int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1;

//@brief Hash table over the word strings: the hash of a word's string is used as the index into this array,
//and the stored value is the word's position in the vocab array

int *vocab_hash;

long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;

long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;

real alpha = 0.025, starting_alpha, sample = 0;

//@brief
//syn0: the input feature vector (embedding) matrix of the words; in CBOW the context words' input vectors are summed into one input
//syn1: the output-side weight matrix, the W in Wx, where x is an input feature vector taken from syn0 (for hierarchical softmax these rows are the inner-node vectors)
//syn1neg: plays the same role as syn1, but is used for negative sampling
//expTable: a precomputed table of the logistic function; instead of evaluating p(x) = exp(x)/(1+exp(x)) while computing P(w|context),
//the value is looked up directly, with x = Wx = syn0·syn1, i.e. p = exp(syn0·syn1)/(1+exp(syn0·syn1))

real *syn0, *syn1, *syn1neg, *expTable;
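//For reference: expTable is filled once before training, in main() of word2vec.c (not shown in
//this excerpt). A minimal sketch of that precomputation, assuming <math.h>/<stdlib.h> are included
//as in the original file; PrecomputeExpTable is a hypothetical wrapper name, the original does
//this inline in main():
static void PrecomputeExpTable() {
  int i;
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    // map slot i to x in [-MAX_EXP, MAX_EXP), then store sigmoid(x) = exp(x) / (1 + exp(x))
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }
}
//At training time a value f = Wx inside (-MAX_EXP, MAX_EXP) is mapped back to a slot via
//(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)), as in the lookups further down.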

clock_t start;

 

int hs = 1, negative = 0;

const int table_size = 1e8;

int *table;
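//@brief InitUnigramTable builds the table used for drawing negative samples:
//each word w occupies a share of table[] proportional to cn(w)^0.75 / sum_v cn(v)^0.75,
//so a uniformly random index into table[] samples words from the 3/4-power-smoothed unigram
//distribution. For example, with counts 100, 10 and 1 the raw unigram probabilities
//0.90 / 0.09 / 0.01 become roughly 0.83 / 0.15 / 0.03 after the 0.75 power.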

 

void InitUnigramTable() {

int a, i;

long long train_words_pow = 0;

real d1, power = 0.75;

table = (int *)malloc(table_size * sizeof(int));

for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);

i = 0;

d1 = pow(vocab[i].cn, power) / (real)train_words_pow;

for (a = 0; a < table_size; a++) {

table[a] = i;

if (a / (real)table_size > d1) {

i++;

d1 += pow(vocab[i].cn, power) / (real)train_words_pow;

}

if (i >= vocab_size) i = vocab_size - 1;

}

}

 

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries

 

//@brief Reads just the word string into word
//@param word buffer that receives the word string
//@param fin the input file stream

void ReadWord(char *word, FILE *fin) {

int a = 0, ch;

while (!feof(fin)) {

ch = fgetc(fin);

if (ch == 13) continue;

if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {

if (a > 0) {

if (ch == '\n') ungetc(ch, fin);

break;

}

if (ch == '\n') {

strcpy(word, (char *)"");

return;

} else continue;

}

word[a] = ch;

a++;

if (a >= MAX_STRING - 1) a--;   // Truncate too long words

}

word[a] = 0;

}

 

// Returns hash value of a word

//@brief Returns the hash code of a word
//hash rule:
//hash = hash * 257 + word[a]
//@param word the word string
//@return the word's hash code

int GetWordHash(char *word) {

unsigned long long a, hash = 0;

for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];

hash = hash % vocab_hash_size;

return hash;

}

 

// Returns position of a word in the vocabulary; if the word is not found, returns -1

//@brief Looks up a word's position in vocab via the hash of its string, i.e. the value of vocab_hash[hash];
//collisions are resolved by linear probing (hash + 1) until the word or an empty slot is found
//@param word the word string
//@return the value of vocab_hash[hash], or -1 if the word is not in the vocabulary

int SearchVocab(char *word) {

unsigned int hash = GetWordHash(word);

while (1) {

if (vocab_hash[hash] == -1) return -1;

if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];

hash = (hash + 1) % vocab_hash_size;

}

return -1;

}

 

// Reads a word and returns its index in the vocabulary

int ReadWordIndex(FILE *fin) {

char word[MAX_STRING];

ReadWord(word, fin);

if (feof(fin)) return -1;

return SearchVocab(word);

}

 

// Adds a word to the vocabulary

int AddWordToVocab(char *word) {

unsigned int hash, length = strlen(word) + 1;

if (length > MAX_STRING) length = MAX_STRING;

vocab[vocab_size].word = (char *)calloc(length, sizeof(char));

strcpy(vocab[vocab_size].word, word);

vocab[vocab_size].cn = 0;

vocab_size++;

// Reallocate memory if needed

if (vocab_size + 2 >= vocab_max_size) {

vocab_max_size += 1000;

vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));

}

hash = GetWordHash(word);

//  printf("%d,%s\n",hash,word);

while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;

vocab_hash[hash] = vocab_size - 1;

return vocab_size - 1;

}

 

// Used later for sorting by word counts

int VocabCompare(const void *a, const void *b) {

return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;

}

 

// Sorts the vocabulary by frequency using word counts

//@brief Sorts the vocabulary by count in descending order, so that frequent words come first, and discards words occurring fewer than min_count times

void SortVocab() {

int a, size;

unsigned int hash;

// Sort the vocabulary and keep </s> at the first position

qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);

for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;

size = vocab_size;

train_words = 0;

for (a = 0; a < size; a++) {

// Words occurring less than min_count times will be discarded from the vocab

if (vocab[a].cn < min_count) {

vocab_size--;

free(vocab[vocab_size].word);

} else {

// Hash will be re-computed, as it is no longer valid after sorting
hash = GetWordHash(vocab[a].word);

while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;

vocab_hash[hash] = a;

train_words += vocab[a].cn;

}

}

vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));

// Allocate memory for the binary tree construction

for (a = 0; a < vocab_size; a++) {

vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));

vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));

}

}

 

// Create binary Huffman tree using the word counts

// Frequent words will have short unique binary codes
//@brief Huffman coding.
//Procedure: repeatedly take the two nodes with the smallest weights (a leaf's weight is the word count cn), merge them into a new parent node, remove the pair from the pool and add the parent back, and repeat until a single tree remains. The result is the Huffman tree: the binary tree that minimizes the expected code length, with longest path O(log|V|).
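//A small worked example of the construction below: take a sorted vocabulary A(4), B(2), C(1), D(1).
//First C and D are merged into a node of weight 2, then that node and B into a node of weight 4,
//then that node and A into the root of weight 8. With the convention used below (binary[min2i] = 1,
//i.e. the larger of each merged pair gets bit 1), the resulting codes are
//A = "1", B = "01", C = "001", D = "000": frequent words get the short codes.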

void CreateBinaryTree() {

long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];

char code[MAX_CODE_LENGTH];

long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));

long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));

long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));

for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;

for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;

pos1 = vocab_size - 1;

pos2 = vocab_size;

// Following algorithm constructs the Huffman tree by adding one node at a time

for (a = 0; a < vocab_size - 1; a++) {

// First, find two smallest nodes 'min1, min2'

if (pos1 >= 0) {

if (count[pos1] < count[pos2]) {

min1i = pos1;

pos1--;

} else {

min1i = pos2;

pos2++;

}

} else {

min1i = pos2;

pos2++;

}

if (pos1 >= 0) {

if (count[pos1] < count[pos2]) {

min2i = pos1;

pos1--;

} else {

min2i = pos2;

pos2++;

}

} else {

min2i = pos2;

pos2++;

}

count[vocab_size + a] = count[min1i] + count[min2i];

parent_node[min1i] = vocab_size + a;

parent_node[min2i] = vocab_size + a;

binary[min2i] = 1;

}

// Now assign binary code to each vocabulary word

for (a = 0; a < vocab_size; a++) {

b = a;

i = 0;

while (1) {

code[i] = binary[b];

//     printf("%lld",binary[b]);

point[i] = b;

printf("%d\n",b);

i++;

b = parent_node[b];

//    printf("\n");

if (b == vocab_size * 2 - 2) break;

}

//printf("%d\n",i);

vocab[a].codelen = i;

//printf("%lld\n",i);

vocab[a].point[0] = vocab_size - 2;

//@brief Store the code and path for each word; code[b] was collected from the leaf upward, so writing it at position i - b - 1 (the slot closest to the leaf gets the leaf's own bit) reverses it, and vocab[a].code / vocab[a].point end up reading from the root down to the word

for (b = 0; b < i; b++) {

vocab[a].code[i - b - 1] = code[b];

vocab[a].point[i - b] = point[b] - vocab_size;

//  printf("%lld\n",vocab[a].point[i - b]);

//printf("%c\n",code[b]);

}

}

free(count);

free(binary);

free(parent_node);

}

 

void InitNet() {

long long a, b;

a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));

if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}

if (hs) {

a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));

if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}

for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)

syn1[a * layer1_size + b] = 0;

}

if (negative>0) {

a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));

if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}

for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)

syn1neg[a * layer1_size + b] = 0;

}

for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)

syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size;
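// Note: syn0 is initialized uniformly at random in [-0.5/layer1_size, 0.5/layer1_size], while syn1/syn1neg above start at zero.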

CreateBinaryTree();

}

 

void *TrainModelThread(void *id) {

long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;

//@brief Note: the sen array holds the current sentence, i.e. the vocab indices of the words that will serve as targets and contexts

long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];

long long l1, l2, c, target, label;

unsigned long long next_random = (long long)id;
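// Note: next_random is a per-thread linear congruential generator seeded with the thread id; the multiplier 25214903917 (0x5DEECE66D) is the same one java.util.Random uses.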

real f, g;

clock_t now;

real *neu1 = (real *)calloc(layer1_size, sizeof(real));

real *neu1e = (real *)calloc(layer1_size, sizeof(real));

FILE *fi = fopen(train_file, "rb");

fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);

while (1) {

if (word_count - last_word_count > 10000) {

word_count_actual += word_count - last_word_count;

last_word_count = word_count;

if ((debug_mode > 1)) {

now=clock();

printf("�lpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,

word_count_actual / (real)(train_words + 1) * 100,

word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));

fflush(stdout);

}

alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1));

if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;

}

if (sentence_length == 0) {

while (1) {

word = ReadWordIndex(fi);

//printf("%d\n",word);

if (feof(fi)) break;

if (word == -1) continue;

word_count++;

if (word == 0) break;   // index 0 is the </s> (end-of-sentence) token

// The subsampling randomly discards frequent words while keeping the ranking same

if (sample > 0) {

real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
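// Note: equivalently ran = sqrt(t/f) + t/f with t = sample and f = vocab[word].cn / train_words;
// the word is kept when ran >= the uniform draw below, so P(keep) = min(1, sqrt(t/f) + t/f):
// very frequent words are aggressively discarded while rare words are always kept.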

next_random = next_random * (unsigned long long)25214903917 + 11;

if (ran < (next_random & 0xFFFF) / (real)65536) continue;

}

sen[sentence_length] = word;

sentence_length++;

if (sentence_length >= MAX_SENTENCE_LENGTH) break;

}

sentence_position = 0;

}

if (feof(fi)) break;

if (word_count > train_words / num_threads) break;

word = sen[sentence_position];

//printf("%d\n",sentence_position);

if (word == -1) continue;

for (c = 0; c < layer1_size; c++) neu1[c] = 0;

for (c = 0; c < layer1_size; c++) neu1e[c] = 0;

next_random = next_random * (unsigned long long)25214903917 + 11;

b = next_random % window;

if (cbow) {  //train the cbow architecture

// in -> hidden

for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {

c = sentence_position - window + a;

if (c < 0) continue;

if (c >= sentence_length) continue;

//@brief last_word is the vocabulary index of the context word

last_word = sen[c];

if (last_word == -1) continue;

//@brief Note: last_word * layer1_size indexes the context word's input feature vector in the syn0 matrix
//Sum the context vectors to form the network's input vector x = neu1

for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];

}

//@brief This loop runs vocab[word].codelen times, i.e. it visits every inner (parent) node on the path between the root of the Huffman tree and the current target word

if (hs) for (d = 0; d < vocab[word].codelen; d++) {

f = 0;

//@brief Note: l2 is the offset of the inner node's vector; in the model this is the row W[parent] of the W in Wx

l2 = vocab[word].point[d] * layer1_size;

// Propagate hidden -> output

//@brief Compute Wx, the dot product of neu1 and the inner node's vector

 

for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];

if (f <= -MAX_EXP) continue;

else if (f >= MAX_EXP) continue;

//@brief Look up exp(Wx) in the table, i.e. p(x) = f = exp(x)/(1+exp(x))

else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];

//@brief This is the heart of the update: the loss is the cross entropy of a logistic regression.
//With z = neu1 · syn1[l2] (i.e. Wx) and p(x) = exp(z)/(1 + exp(z)),
//and the label x = 1 - code (the author defines the label as 1 - code; using code itself would simply flip the signs),
//log L = x*log p(x) + (1 - x)*log(1 - p(x)) = x*z - log(1 + exp(z)).
//The partial derivative of log L with respect to z is x - p(x) = (1 - code - f),
//so the gradient with respect to syn1[c + l2] is (1 - code - f) * neu1[c], which gives
//g = (1 - vocab[word].code[d] - f) * alpha below, with alpha the learning rate.
//In other words, each inner node of the Huffman tree is just a logistic-regression unit; the added annotations end here.
// 'g' is the gradient multiplied by the learning rate
g = (1 - vocab[word].code[d] - f) * alpha;

// Propagate errors output -> hidden

for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];

// Learn weights hidden -> output

for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];

}
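// Note: over the whole path the hierarchical-softmax probability of the target word is
// p(word | context) = prod_d sigmoid((1 - 2*code[d]) * (neu1 . syn1[point[d]])),
// so maximizing its log decomposes into the per-node logistic updates above.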

// NEGATIVE SAMPLING
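// Note: d == 0 is the positive example (target = word, label = 1); the remaining `negative`
// draws come from the unigram^0.75 table with label 0. For each sample the update below uses
// the logistic-regression gradient g = (label - sigmoid(neu1 . syn1neg[target])) * alpha,
// with the sigmoid clipped to 1 above MAX_EXP and to 0 below -MAX_EXP.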

if (negative > 0) for (d = 0; d < negative + 1; d++) {

if (d == 0) {

target = word;

label = 1;

} else {

next_random = next_random * (unsigned long long)25214903917 + 11;

target = table[(next_random >> 16) % table_size];

if (target == 0) target = next_random % (vocab_size - 1) + 1;

if (target == word) continue;

label = 0;

}

l2 = target * layer1_size;

f = 0;

for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];

if (f > MAX_EXP) g = (label - 1) * alpha;

else if (f < -MAX_EXP) g = (label - 0) * alpha;

else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;

for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];

for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];

}

// hidden -> in

for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {

c = sentence_position - window + a;

if (c < 0) continue;

if (c >= sentence_length) continue;

last_word = sen[c];

if (last_word == -1) continue;

for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];

}

} else {  //train skip-gram
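// Note: skip-gram mirrors the CBOW branch above, except that the input is a single context
// word's vector syn0[l1] instead of the summed neu1, and that vector is updated once per
// (context, target) pair via neu1e at the end of the inner loop.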

for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {

c = sentence_position - window + a;

if (c < 0) continue;

if (c >= sentence_length) continue;

last_word = sen[c];

if (last_word == -1) continue;

l1 = last_word * layer1_size;

for (c = 0; c < layer1_size; c++) neu1e[c] = 0;

// HIERARCHICAL SOFTMAX

if (hs) for (d = 0; d < vocab[word].codelen; d++) {

f = 0;

l2 = vocab[word].point[d] * layer1_size;

// Propagate hidden -> output

for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];

if (f <= -MAX_EXP) continue;

else if (f >= MAX_EXP) continue;

else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];

// 'g' is the gradient multiplied by the learning rate

g = (1 - vocab[word].code[d] - f) * alpha;

// Propagate errors output -> hidden

for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];

// Learn weights hidden -> output

for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];

}

// NEGATIVE SAMPLING

if (negative > 0) for (d = 0; d < negative + 1; d++) {

if (d == 0) {

target = word;

label = 1;

} else {

next_random = next_random * (unsigned long long)25214903917 + 11;

target = table[(next_random >> 16) % table_size];

if (target == 0) target = next_random % (vocab_size - 1) + 1;

if (target == word) continue;

label = 0;

}

l2 = target * layer1_size;

f = 0;

for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];

if (f > MAX_EXP) g = (label - 1) * alpha;

else if (f < -MAX_EXP) g = (label - 0) * alpha;

else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;

for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];

for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];

}

// Learn weights input -> hidden

for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];

}

}

sentence_position++;

if (sentence_position >= sentence_length) {

sentence_length = 0;

continue;

}

}

fclose(fi);

free(neu1);

free(neu1e);

pthread_exit(NULL);

}

http://blog.sina.com.cn/s/blog_839cd44d0101flea.html