Source code for sourced.ml.core.algorithms.id_embedding

import numpy


def extract_coocc_matrix(global_shape, word_indices, model):
    """
    Re-index a model's co-occurrence matrix into global vocabulary coordinates.

    Only the tokens of ``model`` that are present in ``word_indices`` are kept.
    Their rows and columns are moved to the positions given by
    ``word_indices``; every other row and column of the result is empty.

    :param global_shape: Shape ``(rows, cols)`` of the returned matrix. Only \
                         ``global_shape[0]`` is used to size the row pointer.
    :param word_indices: Mapping with a ``.get()`` method, from token to its \
                         index in the global vocabulary.
    :param model: Object exposing ``tokens`` (iterable of tokens, position == \
                  local row/column index) and ``matrix`` (anything providing \
                  ``.tocsr()``).
    :return: CSR matrix of shape ``global_shape`` holding the matched entries.

    NOTE(review): the re-indexing assumes that matched tokens map to distinct
    global indices which are all smaller than ``global_shape[0]`` - confirm
    against the callers.
    """
    # Stage 1 - extract the tokens, map them to the global vocabulary
    local_ids = []
    global_ids = []
    for local_id, token in enumerate(model.tokens):
        global_id = word_indices.get(token)
        if global_id is not None:
            local_ids.append(local_id)
            global_ids.append(global_id)
    source = model.matrix.tocsr()
    if not local_ids:
        # Nothing matched: there is no last row to anchor the row pointer on,
        # so return an empty matrix of the requested shape right away.
        return type(source)(tuple(global_shape), dtype=source.dtype)
    indices = numpy.array(local_ids)
    mapped_indices = numpy.array(global_ids)
    # Stage 2 - sort the matched tokens by the index in the vocabulary
    order = numpy.argsort(mapped_indices)
    indices = indices[order]
    mapped_indices = mapped_indices[order]
    # Stage 3 - produce the csr_matrix with the matched tokens **only**
    matrix = source[indices][:, indices]
    # Stage 4 - convert this matrix to the global (ccmatrix) coordinates
    # Columns: a stored column j refers to indices[j], whose global position is
    # mapped_indices[j] because both arrays share the same order. Assigning
    # through [:] keeps the dtype of the existing index array.
    matrix.indices[:] = mapped_indices[matrix.indices]
    # Rows: scatter the per-row entry counts to their global rows (unmatched
    # rows keep a count of zero) and rebuild the row pointer as a running sum.
    n_rows = global_shape[0]
    row_nnz = numpy.zeros(n_rows, dtype=matrix.indptr.dtype)
    row_nnz[mapped_indices] = numpy.diff(matrix.indptr)
    new_indptr = numpy.zeros(n_rows + 1, dtype=matrix.indptr.dtype)
    new_indptr[1:] = numpy.cumsum(row_nnz)
    matrix.indptr = new_indptr
    # The constructor is bypassed on purpose: the arrays above already describe
    # a matrix of the global shape, so only the stored shape needs updating.
    matrix._shape = tuple(global_shape)
    return matrix
Source code for sourced.ml.core.algorithms.id_embedding

sourced.ml.core

Navigation

Related Topics