# Source code for sourced.ml.core.algorithms.token_parser

from enum import Enum
import functools
import re

import Stemmer


class TokenStyle(Enum):
    """Metadata that should allow to reconstruct initial identifier from a list of tokens."""

    # A run of non-letter characters that separated two identifier parts.
    DELIMITER = 1
    # The sub-token was written entirely in upper case.
    TOKEN_UPPER = 2
    # The sub-token was written entirely in lower case.
    TOKEN_LOWER = 3
    # Neither fully upper nor fully lower; restored by upper-casing the first letter.
    TOKEN_CAPITALIZED = 4


class TokenParser:
    """
    Common utilities for splitting and stemming tokens.

    An identifier is broken up on runs of non-letter characters, every alphabetic part is
    further split on lower/upper case transitions, and the lowercased pieces are stemmed
    when they are longer than the stem threshold.
    """

    # Regexp to split source code identifiers
    NAME_BREAKUP_RE = re.compile(r"[^a-zA-Z]+")
    NAME_BREAKUP_KEEP_DELIMITERS_RE = re.compile(r"([^a-zA-Z]+)")  # ... and keep delimiters
    # Example:
    # token = "Var_WithStrangeNAMING__very_strange"
    # NAME_BREAKUP_KEEP_DELIMITERS_RE.split(token) -> ['Var', '_', 'WithStrangeNAMING', '__',
    #                                                  'very', '_', 'strange']
    # NAME_BREAKUP_RE.split(token) -> ['Var', 'WithStrangeNAMING', 'very', 'strange']
    STEM_THRESHOLD = 6
    MAX_TOKEN_LENGTH = 256
    MIN_SPLIT_LENGTH = 3

    def __init__(self, stem_threshold=STEM_THRESHOLD, max_token_length=MAX_TOKEN_LENGTH,
                 min_split_length=MIN_SPLIT_LENGTH, single_shot=False, save_token_style=False,
                 attach_upper=True, use_nn=False, nn_model=None):
        """
        Initialize a new TokenParser.

        :param stem_threshold: We do not stem split parts shorter than or equal to this size.
        :param max_token_length: We cut identifiers longer than this value.
        :param min_split_length: Split parts shorter than this value are not emitted on their
                                 own. If you do not want to filter small tokens set
                                 min_split_length=1.
        :param single_shot: True if we do not want to join small identifiers to next one.
            Example: 'sourced.ml.algorithms' -> ["sourc", "sourcedml", "algorithm",
            "mlalgorithm"]. If True we have only ["sourc", "algorithm"].
        :param save_token_style: value indicating whether yield metadata that can be used to
                                 reconstruct the initial identifier. Requires
                                 `single_shot=True`.
        :param attach_upper: True to attach the last of several uppercase letters in a row to
                      the next token. Example: 'HTMLResponse' -> ["html", "response"] if True,
                      'HTMLResponse' -> ["htmlr", "esponse"] if False.
        :param use_nn: value indicating whether to use the Neural Network-based splitter
                       instead of the heuristics.
        :param nn_model: IdentifierSplitterBiLSTM model UUID to load. None means the most
                         recent.
        :raises ValueError: if `save_token_style` is True while `single_shot` is False.
        """
        # Validate before building the stemmer or loading a model, so that an invalid
        # combination fails without doing any of that work.
        if save_token_style and not single_shot:
            raise ValueError("`save_token_style` requires `single_shot` to be True")
        self._stemmer = self._new_stemmer()
        self._stem_threshold = stem_threshold
        self._max_token_length = max_token_length
        self._min_split_length = min_split_length
        self._single_shot = single_shot
        self._save_token_style = save_token_style
        self._attach_upper = attach_upper
        self._id_splitter_nn = None
        if use_nn:
            self._init_nn(nn_model)

    @staticmethod
    def _new_stemmer():
        """Create the English stemmer with its cache size set to zero."""
        stemmer = Stemmer.Stemmer("english")
        stemmer.maxCacheSize = 0
        return stemmer

    @staticmethod
    def _check_positive_int(name, value):
        """Return `value` if it is an integer >= 1, otherwise raise TypeError/ValueError."""
        if not isinstance(value, int):
            raise TypeError("%s must be an integer - got %s" % (name, type(value)))
        if value < 1:
            raise ValueError("%s must be greater than 0 - got %d" % (name, value))
        return value

    @property
    def use_nn(self):
        """True when a Neural Network splitter has been loaded."""
        return self._id_splitter_nn is not None

    @property
    def stem_threshold(self):
        """Words of this length or shorter are returned unstemmed by `stem()`."""
        return self._stem_threshold

    @stem_threshold.setter
    def stem_threshold(self, value):
        self._stem_threshold = self._check_positive_int("stem_threshold", value)

    @property
    def max_token_length(self):
        """Identifiers are truncated to this many characters before splitting."""
        return self._max_token_length

    @max_token_length.setter
    def max_token_length(self, value):
        self._max_token_length = self._check_positive_int("max_token_length", value)

    def _init_nn(self, nn_model):
        """Load the IdentifierSplitterBiLSTM model identified by `nn_model`."""
        # Imported lazily: the model module is only needed when the NN splitter is requested.
        from sourced.ml.core.models.id_splitter import IdentifierSplitterBiLSTM
        self._id_splitter_nn = IdentifierSplitterBiLSTM().load(source=nn_model)

    @property
    def min_split_length(self):
        """Split parts shorter than this are not emitted on their own."""
        return self._min_split_length

    @min_split_length.setter
    def min_split_length(self, value):
        self._min_split_length = self._check_positive_int("min_split_length", value)

    def __call__(self, token):
        """Shortcut for `process_token(token)`."""
        return self.process_token(token)

    def process_token(self, token):
        """Split `token` and yield every resulting sub-token passed through `stem()`."""
        for word in self.split(token):
            yield self.stem(word)

    def stem(self, word):
        """Return `word` unchanged if it is not longer than the stem threshold, else its stem."""
        if len(word) <= self.stem_threshold:
            return word
        return self._stemmer.stemWord(word)

    def _split(self, token):
        """
        Heuristic splitter: yield the lowercased sub-tokens of a single identifier.

        The identifier is stripped, truncated to `max_token_length` and broken up on runs of
        non-letter characters; each alphabetic part is then split on case transitions.
        When `save_token_style` is set, `(text, TokenStyle)` pairs are yielded instead of
        plain strings and the delimiters are yielded too.
        """
        token = token.strip()[:self.max_token_length]
        keep_style = self._save_token_style
        # "short" is the last too-short piece, waiting to be prefixed to the next long one;
        # "long" is the last long piece, to which the next too-short piece gets appended.
        pending = {"short": "", "long": ""}

        def emit(name):
            lowered = name.lower()
            if len(name) >= self.min_split_length:
                pending["long"] = lowered
                yield lowered
                if pending["short"] and not self._single_shot:
                    yield pending["short"] + lowered
                    pending["short"] = ""
            elif not self._single_shot:
                pending["short"] = lowered
                yield pending["long"] + lowered
                pending["long"] = ""

        def emit_with_style(name):
            # The style is derived from the original casing, before lowercasing.
            if name.isupper():
                meta = TokenStyle.TOKEN_UPPER
            elif name.islower():
                meta = TokenStyle.TOKEN_LOWER
            else:
                meta = TokenStyle.TOKEN_CAPITALIZED
            for piece in emit(name):
                yield piece, meta

        if keep_style:
            produce = emit_with_style
            regexp_splitter = self.NAME_BREAKUP_KEEP_DELIMITERS_RE
        else:
            produce = emit
            regexp_splitter = self.NAME_BREAKUP_RE

        for part in regexp_splitter.split(token):
            if not part:
                continue
            if keep_style and not part.isalpha():
                yield part, TokenStyle.DELIMITER
                continue
            # Internal invariant: both regexps leave only runs of ASCII letters here.
            assert part.isalpha()
            start = 0
            for i in range(1, len(part)):
                this = part[i]
                prev = part[i - 1]
                if prev.islower() and this.isupper():
                    # camelCase boundary: "fooBar" -> "foo" | "Bar"
                    yield from produce(part[start:i])
                    start = i
                elif prev.isupper() and this.islower():
                    # End of an uppercase run: "HTMLResponse". With attach_upper the last
                    # capital starts the next piece, otherwise it stays with the run.
                    if self._attach_upper and i > 1 and part[i - 2].isupper():
                        new_start = i - 1
                    else:
                        new_start = i
                    # Only cut when the uppercase run is longer than a single letter.
                    if i - 1 > start:
                        yield from produce(part[start:new_start])
                        start = new_start
            last = part[start:]
            if last:
                yield from produce(last)

    def __pre_split_token(self, token: str) -> [str]:
        """Split a token on runs of non-letter characters and drop the empty pieces."""
        return [piece for piece in self.NAME_BREAKUP_RE.split(token) if piece != ""]

    def split(self, token: str) -> [str]:
        """
        Splits a single identifier.

        With the NN splitter the identifier is first broken up on non-letter characters and
        the non-empty sub-tokens returned by the model are yielded; otherwise the heuristic
        splitter is used.
        """
        if self.use_nn:
            parts = self.__pre_split_token(token)
            for subtokens in self._id_splitter_nn.split(parts):
                for subtoken in subtokens:
                    if subtoken != "":
                        yield subtoken
        else:
            yield from self._split(token)

    def split_batch(self, tokens: [str]) -> [[str]]:
        """
        Splits a batch of identifiers.

        With the NN splitter the result of the model's `split()` is returned as is; otherwise
        a lazy `map` of heuristic split generators, one per identifier, is returned.
        """
        if self.use_nn:
            # NOTE(review): unlike split(), the identifiers are handed to the model without
            # first being broken up on non-letter characters. The previous code computed the
            # pre-split parts but never used them (and exhausted one-shot iterables while
            # doing so) - confirm which input the model is expected to receive.
            return self._id_splitter_nn.split(tokens)
        return map(self._split, tokens)

    @staticmethod
    def reconstruct(tokens):
        """
        Rebuild an identifier from `(text, TokenStyle)` pairs.

        :param tokens: iterable of pairs as yielded by the splitter with `save_token_style`.
        :return: the concatenation of the pieces with their recorded casing restored.
        """
        res = []
        for t, meta in tokens:
            if meta == TokenStyle.DELIMITER:
                res.append(t)
            elif meta == TokenStyle.TOKEN_LOWER:
                res.append(t.lower())
            elif meta == TokenStyle.TOKEN_UPPER:
                res.append(t.upper())
            elif meta == TokenStyle.TOKEN_CAPITALIZED:
                res.append(t[0].upper() + t[1:])
        return "".join(res)

    def __getstate__(self):
        """Return the instance state without the stemmer, which is rebuilt on unpickling."""
        state = self.__dict__.copy()
        state.pop("_stemmer", None)
        return state

    def __setstate__(self, state):
        """Restore the state and recreate the stemmer configured as in `__init__`."""
        self.__dict__ = state
        self._stemmer = self._new_stemmer()

class NoopTokenParser:
    """
    One can use this class if one does not want to do any parsing.

    It mirrors the calling interface of TokenParser: both `process_token()` and calling the
    instance yield the token unchanged.
    """

    def process_token(self, token):
        """Yield `token` as is, without splitting or stemming."""
        yield token

    def __call__(self, token):
        """Shortcut for `process_token(token)`."""
        return self.process_token(token)
# Source code for sourced.ml.core.algorithms.token_parser
#
# sourced.ml.core
#
# Navigation
#
# Related Topics