Source code for sourced.ml.core.models.id_splitter

import string
from typing import Dict, Iterator, List, Sequence, Tuple

from modelforge import Model, register_model
import numpy
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sourced.ml.core.algorithms.id_splitter.nn_model import (f1score, precision,
                                                             recall)
from sourced.ml.core.models.license import DEFAULT_LICENSE


@register_model
class IdentifierSplitterBiLSTM(Model):
    """
    Bidirectional LSTM Model. Splits identifiers without need for a conventional pattern.

    Reference: https://arxiv.org/abs/1805.11651
    """

    NAME = "id_splitter_bilstm"
    VENDOR = "source{d}"
    DESCRIPTION = "Weights of the BiLSTM network to split source code identifiers."
    LICENSE = DEFAULT_LICENSE
    DEFAULT_MAXLEN = 40
    DEFAULT_PADDING = "post"
    # Lowercase ASCII letters are numbered from 1; index 0 is not assigned to any
    # character (pad_sequences pads with 0 by default).
    DEFAULT_MAPPING = {c: i for i, c in enumerate(string.ascii_lowercase, start=1)}
    DEFAULT_BATCH_SIZE = 4096

    def __init__(self, **kwargs):
        """
        Initialize the wrapper with the default settings and no keras model attached.

        :param kwargs: Forwarded unchanged to the parent `Model` constructor.
        """
        super().__init__(**kwargs)
        self._maxlen = self.DEFAULT_MAXLEN
        self._padding = self.DEFAULT_PADDING
        self._mapping = self.DEFAULT_MAPPING
        self._model = None
        self._batch_size = self.DEFAULT_BATCH_SIZE

    def construct(self, model: keras.models.Model,
                  maxlen: int = DEFAULT_MAXLEN,
                  padding: str = DEFAULT_PADDING,
                  mapping: Dict[str, int] = DEFAULT_MAPPING,
                  batch_size: int = DEFAULT_BATCH_SIZE) -> "IdentifierSplitterBiLSTM":
        """
        Construct IdentifierSplitterBiLSTM model.

        :param model: keras model used for identifier splitting.
        :param maxlen: Maximum length of input identifiers.
        :param padding: Where to pad the identifiers of length < maxlen. It is passed \
                        as-is to keras `pad_sequences`, so it can be "pre" or "post".
        :param mapping: Mapping of characters to integers.
        :param batch_size: Batch size of input data fed to the model.
        :return: BiLSTM based source code identifier splitter.
        """
        self._maxlen = maxlen
        self._padding = padding
        self._mapping = mapping
        self._model = model
        self._batch_size = batch_size
        return self

    @property
    def model(self) -> "keras.models.Model":
        """
        Return the wrapped keras model.
        """
        return self._model

    @property
    def batch_size(self) -> int:
        """
        Return the batch size used to run the model.
        """
        return self._batch_size

    def _generate_tree(self) -> dict:
        """
        Build the serializable tree: keras config and weights plus the preprocessing settings.
        """
        return {
            "config": self._model.get_config(),
            "weights": self._model.get_weights(),
            "mapping": self._mapping,
            "maxlen": self._maxlen,
            "padding": self._padding,
            # Stored so that a reloaded model runs with the batch size it was built with.
            "batch_size": self._batch_size,
        }

    def _load_tree(self, tree: dict) -> None:
        """
        Rebuild the keras model from a tree produced by `_generate_tree`.

        :param tree: Deserialized tree. Trees written before "batch_size" was stored \
                     are still accepted and fall back to `DEFAULT_BATCH_SIZE`.
        """
        model = keras.models.Model.from_config(tree["config"])
        model.set_weights(tree["weights"])
        self.construct(model, maxlen=tree["maxlen"], padding=tree["padding"],
                       mapping=tree["mapping"],
                       batch_size=tree.get("batch_size", self.DEFAULT_BATCH_SIZE))

    def dump(self) -> str:
        """
        Return a one-line human-readable summary of the model settings.
        """
        return "BiLSTM identifier splitter with %d maxlen and %d batch size" % (
            self._maxlen, self._batch_size)

    def _prepare_single_identifier(self, identifier: str) -> Tuple[numpy.ndarray, str]:
        """
        Convert one identifier to its integer features.

        :param identifier: Raw identifier.
        :return: Tuple of the array of mapped integers and the cleaned identifier: \
                 lowercased, reduced to the characters present in the mapping, and \
                 truncated to `maxlen` characters.
        """
        # Clean identifier
        clean_id = "".join(char for char in identifier.lower() if char in self._mapping)
        if len(clean_id) > self._maxlen:
            clean_id = clean_id[:self._maxlen]
        self._log.debug("Preprocessed identifier: %s : %s", identifier, clean_id)
        return numpy.array([self._mapping[c] for c in clean_id]), clean_id

    def prepare_input(self, identifiers: Sequence[str]) -> Tuple[numpy.ndarray, List[str]]:
        """
        Prepare input by converting a sequence of identifiers to the corresponding
        2D-array of mapped character indices and the list of lowercase cleaned identifiers.

        :param identifiers: Raw identifiers.
        :return: Tuple of the padded feature array (one row per identifier, `maxlen` \
                 columns) and the cleaned identifiers in the same order.
        """
        processed_ids = []
        clean_ids = []
        for identifier in identifiers:
            feat, clean_id = self._prepare_single_identifier(identifier)
            processed_ids.append(feat)
            clean_ids.append(clean_id)
        processed_ids = pad_sequences(processed_ids, maxlen=self._maxlen,
                                      padding=self._padding)
        return processed_ids, clean_ids

    def load_model_file(self, path: str):
        """
        Load a compatible Keras model file.

        Used for compatibility.

        :param path: Path to the keras model file.
        """
        self._model = keras.models.load_model(
            path,
            custom_objects={"precision": precision, "recall": recall, "f1score": f1score})

    @staticmethod
    def _tokens_from_labels(clean_id: str, labels: Sequence[float]) -> List[str]:
        """
        Cut a cleaned identifier into tokens; a label equal to 1 starts a new token.

        :param clean_id: Cleaned identifier.
        :param labels: Per-character labels, aligned with `clean_id` from position 0.
        :return: List of tokens; it always holds at least one element, which is the \
                 empty string when `clean_id` is empty.
        """
        tokens = []
        token = ""
        for char, label in zip(clean_id, labels):
            # Skip the cut while the current token is empty: a split predicted on the
            # first character would otherwise emit a spurious "" token.
            if label == 1 and token:
                tokens.append(token)
                token = ""
            token += char
        tokens.append(token)
        return tokens

    def split(self, identifiers: Sequence[str]) -> Iterator[List[str]]:
        """
        Split identifiers using the model.

        This is a generator: nothing is computed until it is iterated, and the tokens
        of each identifier are yielded in input order.

        :param identifiers: Raw identifiers to split.
        :return: Iterator over the token lists, one list per input identifier.
        """
        feats, clean_ids = self.prepare_input(identifiers)
        if not clean_ids:
            # Nothing to split; avoid running the network on an empty batch.
            return
        output = self._model.predict(feats, batch_size=self._batch_size)
        # Assumes the network output is (n_identifiers, maxlen, 1) - TODO confirm.
        output = numpy.round(output)[:, :, 0]
        for clean_id, id_output in zip(clean_ids, output):
            if self._padding == "pre":
                # With "pre" padding the identifier occupies the end of the row, so the
                # leading padded positions must be dropped to realign labels and chars.
                id_output = id_output[len(id_output) - len(clean_id):]
            yield self._tokens_from_labels(clean_id, id_output)