Source code for sourced.ml.core.algorithms.id_embedding

import numpy


def extract_coocc_matrix(global_shape, word_indices, model):
    """
    Re-express a model's co-occurrence matrix in the global vocabulary coordinates.

    Only the tokens of `model` which are present in `word_indices` are kept. Row and \
    column `i` of the model matrix are moved to row and column `word_indices[token_i]` \
    of the result; all the other rows and columns of the result are empty.

    :param global_shape: Shape `(rows, columns)` of the returned matrix.
    :param word_indices: Mapping from token to its index in the global vocabulary. \
                         Only `.get(token)` is called; `None` means "not in the vocabulary". \
                         NOTE(review): the indices are assumed to be unique and smaller \
                         than `global_shape[0]` - confirm against the callers.
    :param model: Object with `tokens` (iterable of tokens) and `matrix` (sparse matrix \
                  with a `tocsr()` method, indexed in the order of `tokens`).
    :return: New CSR matrix of shape `global_shape`. It is empty if none of the model's \
             tokens are in `word_indices`.
    """
    # Stage 1 - extract the tokens, map them to the global vocabulary
    local_indices = []
    global_indices = []
    for i, token in enumerate(model.tokens):
        gi = word_indices.get(token)
        if gi is not None:
            local_indices.append(i)
            global_indices.append(gi)

    csr = model.matrix.tocsr()
    shape = tuple(global_shape)
    if not local_indices:
        # Nothing matched: there is no "last mapped index" to look at,
        # so return an all-zero matrix instead of failing on empty arrays.
        return type(csr)(shape, dtype=csr.dtype)

    # Explicit integer dtype so that the arrays are always valid for fancy indexing
    indices = numpy.array(local_indices, dtype=numpy.intp)
    mapped_indices = numpy.array(global_indices, dtype=numpy.intp)

    # Stage 2 - sort the matched tokens by the index in the vocabulary
    order = numpy.argsort(mapped_indices)
    indices = indices[order]
    mapped_indices = mapped_indices[order]

    # Stage 3 - produce the csr_matrix with the matched tokens **only**
    matrix = csr[indices][:, indices]

    # Stage 4 - convert this matrix to the global (ccmatrix) coordinates
    # Here we use the fact that indices and mapped_indices are in the same order,
    # so local column j corresponds to global column mapped_indices[j].
    new_indices = mapped_indices[matrix.indices]
    # Scatter the per-row element counts to the global rows; the missing rows
    # keep zero elements, and the cumulative sum yields the global indptr.
    row_nnz = numpy.zeros(shape[0], dtype=numpy.intp)
    row_nnz[mapped_indices] = numpy.diff(matrix.indptr)
    new_indptr = numpy.concatenate(([0], numpy.cumsum(row_nnz)))
    return type(matrix)((matrix.data, new_indices, new_indptr), shape=shape)