
I am trying to create a custom version of word2vec. I want to be able to specify that words meeting certain criteria should always stay in the context window, for both CBOW and skip-gram.

I created a new .pyx file based on the word2vec_inner.pyx source code; for instance, I modified the skip-gram training as shown below (my modifications are between the ##### markers). Compiling the Cython code to C succeeds, but my new fast version is still not used. When I checked in IPython whether I could import the new .pyx with pyximport, it failed because some NumPy core .h files could not be found under /Users//.pyxbld/temp.macosx-10.10-x86_64-2.7/pyrex/numpy/
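Presumably pyximport needs to be told where NumPy's headers live before the generated C can compile. A minimal sketch of a .pyxbld file placed next to the .pyx (word2vec_custom.pyxbld is a placeholder name):

def make_ext(modname, pyxfilename):
    # add NumPy's include dir so numpy/arrayobject.h can be found
    from distutils.extension import Extension
    import numpy
    return Extension(name=modname,
                     sources=[pyxfilename],
                     include_dirs=[numpy.get_include()])

Here is the modified skip-gram training loop: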

def train_batch_sg(model, sentences, alpha, _work, compute_loss):
    cdef int hs = model.hs
    cdef int negative = model.negative
    cdef int sample = (model.sample != 0)

    cdef int _compute_loss = (1 if compute_loss == True else 0)
    cdef REAL_t _running_training_loss = model.running_training_loss

    cdef REAL_t *syn0 = <REAL_t *>(np.PyArray_DATA(model.wv.syn0))
    cdef REAL_t *word_locks = <REAL_t *>(np.PyArray_DATA(model.syn0_lockf))
    cdef REAL_t *work
    cdef REAL_t _alpha = alpha
    cdef int size = model.layer1_size

    cdef int codelens[MAX_SENTENCE_LEN]
    #########
    cdef int always_codelens[MAX_SENTENCE_LEN]
    #########
    cdef np.uint32_t indexes[MAX_SENTENCE_LEN]
    ###########
    cdef np.uint32_t always_indexes[MAX_SENTENCE_LEN]
    ###########
    cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN]
    cdef int sentence_idx[MAX_SENTENCE_LEN + 1]
    #########
    cdef int sentence_always_idx[MAX_SENTENCE_LEN + 1]
    #########
    cdef int window = model.window

    cdef int i, j, k, m
    cdef int effective_words = 0, effective_sentences = 0
    #########
    cdef int always_effective_words = 0
    #########
    cdef int sent_idx, idx_start, idx_end, always_idx_start, always_idx_end

    # For hierarchical softmax
    cdef REAL_t *syn1
    cdef np.uint32_t *points[MAX_SENTENCE_LEN]
    cdef np.uint8_t *codes[MAX_SENTENCE_LEN]
    cdef np.uint32_t *always_points[MAX_SENTENCE_LEN]
    cdef np.uint8_t *always_codes[MAX_SENTENCE_LEN]

    # For negative sampling
    cdef REAL_t *syn1neg
    cdef np.uint32_t *cum_table
    cdef unsigned long long cum_table_len
    # for sampling (negative and frequent-word downsampling)
    cdef unsigned long long next_random

    if hs:
        syn1 = <REAL_t *>(np.PyArray_DATA(model.syn1))

    if negative:
        syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
        cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
        cum_table_len = len(model.cum_table)
    if negative or sample:
        next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)

    # convert Python structures to primitive types, so we can release the GIL
    work = <REAL_t *>np.PyArray_DATA(_work)

    # prepare C structures so we can go "full C" and release the Python GIL
    vlookup = model.wv.vocab
    sentence_idx[0] = 0  # indices of the first sentence always start at 0
    sentence_always_idx[0] = 0  # same for the per-sentence "always"-word ranges
    for sent in sentences:
        if not sent:
            continue  # ignore empty sentences; leave effective_sentences unchanged
        for token in sent:
            #########
            if token in vlookup:
                if token.startswith('.'):
                    word = vlookup[token]  # token is already known to be in the vocab here
                    if sample and word.sample_int < random_int32(&next_random):
                        continue
                    always_indexes[always_effective_words] = word.index
                    if hs:
                        always_codelens[always_effective_words] = <int>len(word.code)
                        always_codes[always_effective_words] = <np.uint8_t *>np.PyArray_DATA(word.code)
                        always_points[always_effective_words] = <np.uint32_t *>np.PyArray_DATA(word.point)
                    always_effective_words += 1
                    if always_effective_words == MAX_SENTENCE_LEN:
                        break  # TODO: log warning, tally overflow?
            #########
            word = vlookup[token] if token in vlookup else None
            if word is None:
                continue  # leaving `effective_words` unchanged = shortening the sentence = expanding the window
            if sample and word.sample_int < random_int32(&next_random):
                continue
            indexes[effective_words] = word.index
            if hs:
                codelens[effective_words] = <int>len(word.code)
                codes[effective_words] = <np.uint8_t *>np.PyArray_DATA(word.code)
                points[effective_words] = <np.uint32_t *>np.PyArray_DATA(word.point)
            effective_words += 1
            if effective_words == MAX_SENTENCE_LEN:
                break  # TODO: log warning, tally overflow?

        # keep track of which words go into which sentence, so we don't train
        # across sentence boundaries.
        # indices of sentence number X are between [sentence_idx[X], sentence_idx[X+1])
        effective_sentences += 1
        sentence_idx[effective_sentences] = effective_words
        sentence_always_idx[effective_sentences] = always_effective_words

        if effective_words == MAX_SENTENCE_LEN:
            break  # TODO: log warning, tally overflow?

    # precompute "reduced window" offsets in a single randint() call
    for i, item in enumerate(model.random.randint(0, window, effective_words)):
        reduced_windows[i] = item

    # release GIL & train on all sentences
    with nogil:
        for sent_idx in range(effective_sentences):
            idx_start = sentence_idx[sent_idx]
            idx_end = sentence_idx[sent_idx + 1]
            always_idx_start = sentence_always_idx[sent_idx]
            always_idx_end = sentence_always_idx[sent_idx + 1]
            for i in range(idx_start, idx_end):
                j = i - window + reduced_windows[i]
                if j < idx_start:
                    j = idx_start
                k = i + window + 1 - reduced_windows[i]
                if k > idx_end:
                    k = idx_end
                for j in range(j, k):
                    if j == i:
                        continue
                    if hs:
                        fast_sentence_sg_hs(points[i], codes[i], codelens[i], syn0, syn1, size, indexes[j], _alpha, work, word_locks, _compute_loss, &_running_training_loss)
                    if negative:
                        next_random = fast_sentence_sg_neg(negative, cum_table, cum_table_len, syn0, syn1neg, size, indexes[i], indexes[j], _alpha, work, next_random, word_locks, _compute_loss, &_running_training_loss)
                #########
                # additionally train the current word against every "always" word
                # collected for this sentence, so those words act as if they were
                # always inside the (possibly reduced) window
                for m in range(always_idx_start, always_idx_end):
                    if hs:
                        fast_sentence_sg_hs(always_points[m], always_codes[m], always_codelens[m], syn0, syn1, size, indexes[i], _alpha, work, word_locks, _compute_loss, &_running_training_loss)
                    if negative:
                        next_random = fast_sentence_sg_neg(negative, cum_table, cum_table_len, syn0, syn1neg, size, always_indexes[m], indexes[i], _alpha, work, next_random, word_locks, _compute_loss, &_running_training_loss)
                #########
    model.running_training_loss = _running_training_loss
    return effective_words
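For completeness, once the extension does compile, one way to make gensim actually use it is to rebind the module-level kernel that Word2Vec._do_train_job resolves at call time. A sketch, assuming the compiled module imports as word2vec_custom (a placeholder name):

import gensim.models.word2vec as w2v
import word2vec_custom  # assumed name of the compiled custom extension

# _do_train_job looks up train_batch_sg as a module-level name, so rebinding
# it makes training run the custom kernel instead of word2vec_inner's
w2v.train_batch_sg = word2vec_custom.train_batch_sg

sentences = [["the", ".special", "token"], ["another", "plain", "sentence"]]
model = w2v.Word2Vec(sentences, sg=1, min_count=1)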
– Themis Mavridis
  • Possible duplicate of [Cython: "fatal error: numpy/arrayobject.h: No such file or directory"](https://stackoverflow.com/questions/14657375/cython-fatal-error-numpy-arrayobject-h-no-such-file-or-directory) – DavidW Oct 17 '17 at 18:20
  • Specifically [this answer](https://stackoverflow.com/a/14678559/4657412) for `pyximport` – DavidW Oct 17 '17 at 18:20
  • I've given some generic ideas for your overall needs on the gensim discussion list, where you also asked this question. The specific error you're getting when trying your import in a notebook suggests there may be issues with what's available in your overall environment, and seems a strange error to get at import-time (rather than compile-time). You should reproduce what you tried and saw, exactly, here. Also, are you using a virtual environment, rather than the system Python installation? If not, that's a very good idea for compartmentalization & clarity over what's the active/findable code. – gojomo Oct 17 '17 at 19:12

0 Answers