Source code for sourced.ml.core.models.coocc
from modelforge.model import (
assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, Model, split_strings)
from modelforge.models import register_model
from sourced.ml.core.models.license import DEFAULT_LICENSE
@register_model
class Cooccurrences(Model):
    """
    Co-occurrence matrix.

    Holds a list of tokens together with a sparse matrix whose rows and columns
    follow the order of that list.
    """

    NAME = "co-occurrences"
    DESCRIPTION = "Model that contains the sparse co-occurrence matrix of source code identifiers."
    LICENSE = DEFAULT_LICENSE

    def construct(self, tokens, matrix):
        """
        Initializes the model from in-memory objects.

        :param tokens: the tokens, ordered like the rows and cols of `matrix`.
        :param matrix: the sparse co-occurrence matrix.
        :return: self, so that the calls can be chained.
        """
        self._tokens = tokens
        self._matrix = matrix
        return self

    def _load_tree(self, tree):
        """
        Restores the model state from the serialized `tree`.

        :param tree: mapping with the "tokens" and "matrix" entries, the same layout \
                     which `_generate_tree()` produces.
        """
        self.construct(tokens=split_strings(tree["tokens"]),
                       matrix=assemble_sparse_matrix(tree["matrix"]))

    def dump(self):
        """
        Returns the short human-readable summary of the model: the number of tokens, \
        the first 10 tokens, the matrix shape and the number of stored non-zero elements.
        """
        return """Number of words: %d
First 10 words: %s
Matrix: shape: %s non-zero: %d""" % (
            len(self.tokens), self.tokens[:10], self.matrix.shape, self.matrix.getnnz())

    @property
    def tokens(self):
        """
        Returns the tokens in the order which corresponds to the matrix's rows and cols.
        """
        return self._tokens

    @property
    def matrix(self):
        """
        Returns the sparse co-occurrence matrix.
        """
        return self._matrix

    def __len__(self):
        """
        Returns the number of tokens in the model.
        """
        return len(self._tokens)

    def _generate_tree(self):
        """
        Builds the serializable representation of the model.

        :return: dict with the merged "tokens" and the disassembled "matrix".
        """
        return {"tokens": merge_strings(self.tokens),
                "matrix": disassemble_sparse_matrix(self.matrix)}

    def matrix_to_rdd(self, spark_context: "pyspark.SparkContext") -> "pyspark.RDD":
        """
        Converts the co-occurrence matrix to a Spark RDD.

        :param spark_context: the context which is used to parallelize the matrix arrays.
        :return: RDD which zips the row indices with the col indices and then with the \
                 values, so each element is nested as ((row, col), value).
        """
        # NOTE(review): `_log` is not defined in this class - presumably it is provided
        # by the `Model` base class.
        self._log.info("Convert coocc model to RDD...")
        # Only the COO layout exposes the `row` and `col` arrays. The matrix passed to
        # construct() may use any sparse format, so convert before reading them.
        coo = self._matrix.tocoo()
        rdd_row = spark_context.parallelize(coo.row)
        rdd_col = spark_context.parallelize(coo.col)
        rdd_data = spark_context.parallelize(coo.data)
        return rdd_row.zip(rdd_col).zip(rdd_data)