I am trying to create a custom version of word2vec. I want to be able to define that words meeting certain criteria (in my case, tokens starting with '.') should always stay in the context window, for either CBOW or skip-gram.

I created a new .pyx file based on the word2vec_inner.pyx source code. The Cython code compiles to C without errors, but my new fast version is still not used. When I checked in IPython whether I can import the new .pyx with pyximport, the import failed because some NumPy core .h files could not be found under /Users//.pyxbld/temp.macosx-10.10-x86_64-2.7/pyrex/numpy/.

For instance, I modified the skip-gram training as below; my modifications are between the ######### markers.
def train_batch_sg(model, sentences, alpha, _work, compute_loss):
cdef int hs = model.hs
cdef int negative = model.negative
cdef int sample = (model.sample != 0)
cdef int _compute_loss = (1 if compute_loss == True else 0)
cdef REAL_t _running_training_loss = model.running_training_loss
cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
cdef REAL_t *word_locks = <REAL_t *>(np.PyArray_DATA(model.syn0_lockf))
cdef REAL_t *work
cdef REAL_t _alpha = alpha
cdef int size = model.layer1_size
cdef int codelens[MAX_SENTENCE_LEN]
#########
cdef int always_codelens[MAX_SENTENCE_LEN]
#########
cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
###########
cdef np.uint32_t always_indexes[MAX_SENTENCE_LEN]
###########
cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN]
cdef int sentence_idx[MAX_SENTENCE_LEN + 1]
#########
cdef int sentence_always_idx[MAX_SENTENCE_LEN + 1]
#########
cdef int window = model.window
cdef int i, j, k
cdef int effective_words = 0, effective_sentences = 0
#########
cdef int always_effective_words = 0, always_effective_sentences = 0
#########
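    # (the always_* variables mirror the standard per-sentence bookkeeping,
    #  but are only filled with the words that should always stay in the window)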
cdef int sent_idx, idx_start, idx_end
# For hierarchical softmax
cdef REAL_t *syn1
cdef np.uint32_t *points[MAX_SENTENCE_LEN]
cdef np.uint8_t *codes[MAX_SENTENCE_LEN]
    #########
    cdef np.uint32_t *always_points[MAX_SENTENCE_LEN]
    cdef np.uint8_t *always_codes[MAX_SENTENCE_LEN]
    #########
# For negative sampling
cdef REAL_t *syn1neg
cdef np.uint32_t *cum_table
cdef unsigned long long cum_table_len
# for sampling (negative and frequent-word downsampling)
cdef unsigned long long next_random
if hs:
syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))
if negative:
syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
cum_table_len = len(model.cum_table)
if negative or sample:
next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)
# convert Python structures to primitive types, so we can release the GIL
work = <REAL_t *>np.PyArray_DATA(_work)
# prepare C structures so we can go "full C" and release the Python GIL
vlookup = model.wv.vocab
sentence_idx[0] = 0 # indices of the first sentence always start at 0
for sent in sentences:
if not sent:
continue # ignore empty sentences; leave effective_sentences unchanged
for token in sent:
            #########
            # tokens matching my "always in the window" criterion (here: tokens
            # starting with '.') are collected into the separate always_* arrays,
            # in addition to the normal per-sentence arrays filled below
            if token in vlookup and token.startswith('.'):
                word = vlookup[token]
                if sample and word.sample_int < random_int32(&next_random):
                    continue
                always_indexes[always_effective_words] = word.index
                if hs:
                    always_codelens[always_effective_words] = <int>len(word.code)
                    always_codes[always_effective_words] = <np.uint8_t *>np.PyArray_DATA(word.code)
                    always_points[always_effective_words] = <np.uint32_t *>np.PyArray_DATA(word.point)
                always_effective_words += 1
                if always_effective_words == MAX_SENTENCE_LEN:
                    break  # TODO: log warning, tally overflow?
            #########
word = vlookup[token] if token in vlookup else None
if word is None:
continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window
if sample and word.sample_int < random_int32(&next_random):
continue
indexes[effective_words] = word.index
if hs:
codelens[effective_words] = <int>len(word.code)
codes[effective_words] = <np.uint8_t *>np.PyArray_DATA(word.code)
points[effective_words] = <np.uint32_t *>np.PyArray_DATA(word.point)
effective_words += 1
if effective_words == MAX_SENTENCE_LEN:
break # TODO: log warning, tally overflow?
# keep track of which words go into which sentence, so we don't train
# across sentence boundaries.
        # indices of sentence number X are between <sentence_idx[X], sentence_idx[X + 1])
effective_sentences += 1
sentence_idx[effective_sentences] = effective_words
if effective_words == MAX_SENTENCE_LEN:
break # TODO: log warning, tally overflow?
# precompute "reduced window" offsets in a single randint() call
for i, item in enumerate(model.random.randint(0, window, effective_words)):
reduced_windows[i] = item
# release GIL & train on all sentences
with nogil:
for sent_idx in range(effective_sentences):
idx_start = sentence_idx[sent_idx]
idx_end = sentence_idx[sent_idx + 1]
for i in range(idx_start, idx_end):
j = i - window + reduced_windows[i]
if j < idx_start:
j = idx_start
k = i + window + 1 - reduced_windows[i]
if k > idx_end:
k = idx_end
for j in range(j, k):
if j == i:
continue
if hs:
fast_sentence_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], _alpha, work, word_locks, _compute_loss, &_running_training_loss)
if negative:
next_random = fast_sentence_sg_neg(negative, cum_table, cum_table_len, syn0, syn1neg, size, indexes[i], indexes[j], _alpha, work, next_random, word_locks, _compute_loss, &_running_training_loss)
                    #########
                    # additionally train on the matching "always" entries, guarding against
                    # reading beyond the always_* words actually collected for this batch
                    if i < always_effective_words and j < always_effective_words:
                        if hs:
                            fast_sentence_sg_hs(always_points[i], always_codes[i], always_codelens[i], syn0, syn1, size, always_indexes[j], _alpha, work, word_locks, _compute_loss, &_running_training_loss)
                        if negative:
                            next_random = fast_sentence_sg_neg(negative, cum_table, cum_table_len, syn0, syn1neg, size, always_indexes[i], always_indexes[j], _alpha, work, next_random, word_locks, _compute_loss, &_running_training_loss)
#########
model.running_training_loss = _running_training_loss
return effective_words
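For reference, this is roughly the kind of pyximport setup I mean when checking the import in IPython (a minimal sketch, not my exact session; word2vec_inner_custom is just a placeholder name for my new .pyx file). My understanding is that the NumPy header directory has to be passed in explicitly, e.g. via numpy.get_include(), for the missing .h files to be found:

import numpy as np
import pyximport

# minimal sketch: compile and import a custom .pyx on the fly, pointing the
# build at NumPy's C headers so arrayobject.h and friends can be found
pyximport.install(setup_args={"include_dirs": [np.get_include()]})

import word2vec_inner_custom  # placeholder name for my modified .pyx

Even if that import succeeds, I assume gensim's word2vec.py would still fall back to its slow pure-Python training unless it is also changed to import my module instead of gensim.models.word2vec_inner, which may be why the new fast version is not being picked up.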