Source code for sourced.ml.core.models.id_splitter

import string
from typing import Dict, Iterator, List, Sequence, Tuple

from modelforge import Model, register_model
import numpy
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sourced.ml.core.algorithms.id_splitter.nn_model import (f1score, precision,
                                                             recall)
from sourced.ml.core.models.license import DEFAULT_LICENSE


@register_model
class IdentifierSplitterBiLSTM(Model):
    """
    Bidirectional LSTM Model. Splits identifiers without need for a conventional pattern.
    Reference: https://arxiv.org/abs/1805.11651
    """

    # Model metadata.
    NAME = "id_splitter_bilstm"
    VENDOR = "source{d}"
    DESCRIPTION = "Weights of the BiLSTM network to split source code identifiers."
    LICENSE = DEFAULT_LICENSE

    # Identifiers are truncated/padded to this many characters.
    DEFAULT_MAXLEN = 40
    # Side on which short identifiers are padded; forwarded to keras `pad_sequences`.
    DEFAULT_PADDING = "post"
    # Lowercase ASCII letters are numbered from 1, which keeps 0 (the default fill value
    # of `pad_sequences`) free for padding. Characters outside the mapping are dropped.
    DEFAULT_MAPPING = {c: i for i, c in enumerate(string.ascii_lowercase, start=1)}
    # Batch size passed to `keras.Model.predict`.
    DEFAULT_BATCH_SIZE = 4096

    def __init__(self, **kwargs):
        """
        Initialize the splitter with the default settings and no keras model attached.

        :param kwargs: Forwarded unchanged to the base `Model` constructor.
        """
        super().__init__(**kwargs)
        self._maxlen = self.DEFAULT_MAXLEN
        self._padding = self.DEFAULT_PADDING
        self._mapping = self.DEFAULT_MAPPING
        self._model = None
        self._batch_size = self.DEFAULT_BATCH_SIZE

    def construct(self, model: keras.models.Model,
                  maxlen: int = DEFAULT_MAXLEN,
                  padding: str = DEFAULT_PADDING,
                  mapping: Dict[str, int] = DEFAULT_MAPPING,
                  batch_size: int = DEFAULT_BATCH_SIZE) -> "IdentifierSplitterBiLSTM":
        """
        Construct IdentifierSplitterBiLSTM model.

        :param model: keras model used for identifier splitting.
        :param maxlen: Maximum length of input identifiers.
        :param padding: Where to pad the identifiers of length < maxlen. Can be "pre" or \
                        "post", as accepted by keras `pad_sequences`.
        :param mapping: Mapping of characters to integers.
        :param batch_size: Batch size of input data fed to the model.
        :return: BiLSTM based source code identifier splitter.
        """
        self._maxlen = maxlen
        self._padding = padding
        self._mapping = mapping
        self._model = model
        self._batch_size = batch_size
        return self

    @property
    def model(self) -> "keras.models.Model":
        """
        Return the wrapped keras model.
        """
        return self._model

    @property
    def batch_size(self) -> int:
        """
        Return the batch size used to run the model.
        """
        return self._batch_size

    def _generate_tree(self) -> dict:
        """
        Build the serializable representation of the model.

        :return: dict with the keras config and weights plus the preprocessing settings.
        """
        return {
            "config": self._model.get_config(),
            "weights": self._model.get_weights(),
            "mapping": self._mapping,
            "maxlen": self._maxlen,
            "padding": self._padding,
            # Persisted so that a custom batch size survives a save/load round trip.
            "batch_size": self._batch_size,
        }

    def _load_tree(self, tree: dict) -> None:
        """
        Restore the model from the representation produced by `_generate_tree()`.

        :param tree: dict with "config", "weights", "maxlen", "padding", "mapping" and, \
                     optionally, "batch_size".
        """
        model = keras.models.Model.from_config(tree["config"])
        model.set_weights(tree["weights"])
        # Trees written before "batch_size" was stored fall back to the default value.
        batch_size = tree.get("batch_size", self.DEFAULT_BATCH_SIZE)
        self.construct(model, maxlen=tree["maxlen"],
                       padding=tree["padding"], mapping=tree["mapping"],
                       batch_size=batch_size)

    def dump(self) -> str:
        """
        Return a one-line human-readable summary of the model settings.
        """
        return "BiLSTM identifier splitter with %d maxlen and %d batch size" % (self._maxlen,
                                                                                self._batch_size)

    def _prepare_single_identifier(self, identifier: str) -> Tuple[numpy.ndarray, str]:
        """
        Convert one identifier to its integer codes and its cleaned form.

        :param identifier: Raw identifier.
        :return: Array of the mapped character codes and the cleaned identifier: lowercased, \
                 restricted to the characters of the mapping, cut to at most `maxlen` characters.
        """
        # Clean identifier
        clean_id = "".join(char for char in identifier.lower() if char in self._mapping)
        if len(clean_id) > self._maxlen:
            clean_id = clean_id[:self._maxlen]
        self._log.debug("Preprocessed identifier: %s : %s", identifier, clean_id)
        return numpy.array([self._mapping[c] for c in clean_id]), clean_id

    def prepare_input(self, identifiers: Sequence[str]) -> Tuple[numpy.ndarray, List[str]]:
        """
        Prepare input by converting a sequence of identifiers to the corresponding \
        integer code 2D-array and the list of lowercase cleaned identifiers.

        :param identifiers: Raw identifiers.
        :return: Array of the character codes, padded to `maxlen`, and the cleaned identifiers \
                 in the same order.
        """
        processed_ids = []
        clean_ids = []
        for identifier in identifiers:
            feat, clean_id = self._prepare_single_identifier(identifier)
            processed_ids.append(feat)
            clean_ids.append(clean_id)

        processed_ids = pad_sequences(processed_ids, maxlen=self._maxlen, padding=self._padding)
        return processed_ids, clean_ids

    def load_model_file(self, path: str):
        """
        Load a compatible Keras model file. Used for compatibility.

        Only the wrapped keras model is replaced; maxlen, padding, mapping and batch size \
        keep their current values.

        :param path: Path to the keras model file.
        """
        self._model = keras.models.load_model(path, custom_objects={"precision": precision,
                                                                    "recall": recall,
                                                                    "f1score": f1score})

    def split(self, identifiers: Sequence[str]) -> Iterator[List[str]]:
        """
        Split identifiers into tokens, using the model.

        This is a generator: nothing, including the model prediction, runs before the result \
        is iterated.

        :param identifiers: Raw identifiers.
        :return: Iterator over the token lists, one per identifier, in the input order. The \
                 tokens are made of the cleaned (lowercase, mapped, truncated) characters.
        """
        feats, clean_ids = self.prepare_input(identifiers)
        if not clean_ids:
            # Nothing to split: do not run the network on an empty batch.
            return
        output = self._model.predict(feats, batch_size=self._batch_size)
        # Round the scores to 0/1 labels and drop the trailing channel axis.
        output = numpy.round(output)[:, :, 0]
        for clean_id, id_output in zip(clean_ids, output):
            if self._padding == "pre":
                # With "pre" padding the characters sit at the end of the padded row, so
                # only the trailing labels belong to them.
                # NOTE(review): assumes the model emits one label per input position.
                id_output = id_output[len(id_output) - len(clean_id):]
            identifier_tokens = []
            token = ""
            for char, label in zip(clean_id, id_output):
                # A label of 1 starts a new token at this character. Skip the flush while
                # the current token is empty so a split predicted on the very first
                # character does not produce a spurious "" token.
                if label == 1 and token:
                    identifier_tokens.append(token)
                    token = ""
                token += char
            identifier_tokens.append(token)
            yield identifier_tokens
Source code for sourced.ml.core.models.id_splitter

sourced.ml.core

Navigation

Related Topics