Source code for sourced.ml.core.models.bow

import logging
from typing import Dict, Iterable, List

from modelforge import assemble_sparse_matrix, disassemble_sparse_matrix, merge_strings, Model, \
    register_model, split_strings
from modelforge.progress_bar import progress_bar
from scipy import sparse

from sourced.ml.core.models.df import DocumentFrequencies
from sourced.ml.core.models.license import DEFAULT_LICENSE


@register_model
class BOW(Model):
    """
    Weighted bag of words model. Every word corresponds to an index and its matrix column.
    Bag is a word set from repository, file or anything else.
    Word is source code identifier or its part.
    This model depends on :class:`sourced.ml.core.models.df.DocumentFrequencies`.
    """
    NAME = "bow"
    VENDOR = "source{d}"
    DESCRIPTION = "Model that contains source code as weighted bag of words."
    LICENSE = DEFAULT_LICENSE

    def construct(self, documents: List[str], tokens: List[str], matrix: sparse.spmatrix):
        """
        Initializes the model from in-memory data.

        :param documents: Document names, one per matrix row.
        :param tokens: Token names, one per matrix column.
        :param matrix: Sparse matrix with the weights; its shape must be \
                       (len(documents), len(tokens)).
        :return: self.
        :raises ValueError: if the matrix shape does not agree with the number of documents \
                            or tokens.
        """
        if matrix.shape[0] != len(documents):
            raise ValueError("matrix shape mismatch, documents %d != %d" % (
                matrix.shape[0], len(documents)))
        if matrix.shape[1] != len(tokens):
            raise ValueError("matrix shape mismatch, tokens %d != %d" % (
                matrix.shape[1], len(tokens)))
        self._documents = documents
        self._matrix = matrix
        self._tokens = tokens
        return self

    def dump(self):
        """
        Returns a short textual summary: the matrix shape and the first 10 documents and tokens.
        """
        return "Shape: %s\n" \
               "First 10 documents: %s\n" \
               "First 10 tokens: %s" % \
               (self._matrix.shape, self._documents[:10], self.tokens[:10])

    @property
    def matrix(self) -> sparse.spmatrix:
        """
        Returns the bags as a sparse matrix. Rows are documents and columns are tokens weight.
        """
        return self._matrix

    @property
    def documents(self):
        """
        The list of documents in the model.
        """
        return self._documents

    @property
    def tokens(self):
        """
        The list of tokens in the model.
        """
        return self._tokens

    def __getitem__(self, item: int):
        """
        Returns document name, word indices and weights for the given document index.

        :param item: Document index.
        :return: (name, :class:`numpy.ndarray` with word indices, \
                  :class:`numpy.ndarray` with weights)
        """
        # NOTE(review): `.indices` / `.data` on the selected row presumably require a
        # row-indexable format such as CSR - confirm against how the matrix is built.
        data = self._matrix[item]
        return self._documents[item], data.indices, data.data

    def __iter__(self):
        """
        Returns an iterator over the document indices.
        """
        return iter(range(len(self)))

    def __len__(self):
        """
        Returns the number of documents.
        """
        return len(self._documents)

    def save(self, output: str, series: str, deps: Iterable = tuple(),
             create_missing_dirs: bool = True):
        """
        Saves the model, making sure that the DocumentFrequencies dependency is recorded.

        :param output: Passed through to the base class `save()`.
        :param series: Passed through to the base class `save()`.
        :param deps: Dependencies of the model. If empty, the dependency registered under \
                     `DocumentFrequencies.NAME` is looked up with `get_dep()` and used.
        :param create_missing_dirs: Passed through to the base class `save()`.
        :return: Whatever the base class `save()` returns.
        :raises ValueError: if `deps` is empty and no DocumentFrequencies dependency is found.
        """
        if not deps:
            try:
                deps = [self.get_dep(DocumentFrequencies.NAME)]
            except KeyError:
                # The KeyError itself is not helpful to the user, hence "from None".
                raise ValueError(
                    "You must specify DocumentFrequencies dependency to save BOW.") from None
        return super().save(output=output, series=series, deps=deps,
                            create_missing_dirs=create_missing_dirs)

    def convert_bow_to_vw(self, output: str):
        """
        Writes the model as text, one document per line: the document name followed by \
        space-separated "token:weight" pairs.

        Colons are removed from the document name and its spaces are replaced with \
        underscores. Token indices which are missing from the vocabulary are logged and skipped.

        :param output: Path to the file to write.
        """
        log = logging.getLogger("bow2vw")
        log.info("Writing %s", output)
        tokens = self.tokens
        # Explicit encoding: names may contain non-ASCII characters and the platform
        # default is not guaranteed to handle them.
        with open(output, "w", encoding="utf-8") as fout:
            for index in progress_bar(self, log, expected_size=len(self)):
                name, indices, weights = self[index]
                pairs = []
                for t, v in zip(indices, weights):
                    try:
                        word = tokens[t]
                    except (KeyError, IndexError):
                        log.warning("%d not found in the vocabulary", t)
                        continue
                    pairs.append("%s:%s" % (word, v))
                fout.write("%s %s\n" % (
                    name.replace(":", "").replace(" ", "_"), " ".join(pairs)))

    def documents_index(self) -> Dict[str, int]:
        """
        Builds the mapping from document name to its index (the matrix row).

        If a name occurs several times, the last index wins.
        """
        return {name: index for index, name in enumerate(self._documents)}

    def _generate_tree(self):
        """
        Packs the documents, the matrix and the tokens into a dict for serialization.
        """
        return {"documents": merge_strings(self._documents),
                "matrix": disassemble_sparse_matrix(self._matrix),
                "tokens": merge_strings(self.tokens)}

    def _load_tree_kwargs(self, tree: dict):
        """
        Unpacks a dict produced by :func:`_generate_tree` into `construct()` keyword arguments.
        """
        return {"documents": split_strings(tree["documents"]),
                "matrix": assemble_sparse_matrix(tree["matrix"]),
                "tokens": split_strings(tree["tokens"])}

    def _load_tree(self, tree: dict):
        """
        Restores the model state from a serialized tree.
        """
        self.construct(**self._load_tree_kwargs(tree))
Source code for sourced.ml.core.models.bow

sourced.ml.core

Navigation

Related Topics