from enum import Enum
import functools
import re
import Stemmer
class TokenStyle(Enum):
    """Metadata that allows the initial identifier to be reconstructed from a list of tokens."""

    # NOTE(review): the member names are the ones TokenParser._split and
    # TokenParser.reconstruct refer to; the numeric values are not visible in this
    # view of the file - confirm them against the canonical definition.
    DELIMITER = 1  # a run of non-letter characters, kept verbatim
    TOKEN_UPPER = 2  # the subtoken was entirely upper case
    TOKEN_LOWER = 3  # the subtoken was entirely lower case
    TOKEN_CAPITALIZED = 4  # any other casing; rebuilt by upper-casing the first letter
class TokenParser:
    """
    Common utilities for splitting and stemming tokens.
    """

    # Regexp to split source code identifiers on every run of non-letter characters
    NAME_BREAKUP_RE = re.compile(r"[^a-zA-Z]+")
    # ... and the same pattern inside a capturing group, so that split() keeps delimiters
    NAME_BREAKUP_KEEP_DELIMITERS_RE = re.compile(r"([^a-zA-Z]+)")
    # Example:
    # token = "Var_WithStrangeNAMING__very_strange"
    # NAME_BREAKUP_KEEP_DELIMITERS_RE.split(token) -> ['Var', '_', 'WithStrangeNAMING', '__',
    #                                                  'very', '_', 'strange']
    # NAME_BREAKUP_RE.split(token) -> ['Var', 'WithStrangeNAMING', 'very', 'strange']
    # NOTE(review): __init__ takes its defaults from STEM_THRESHOLD, MAX_TOKEN_LENGTH and
    # MIN_SPLIT_LENGTH; their definitions are not visible in this view - confirm that they
    # are defined in this class body.
def __init__(self, stem_threshold=STEM_THRESHOLD, max_token_length=MAX_TOKEN_LENGTH,
             min_split_length=MIN_SPLIT_LENGTH, single_shot=False, save_token_style=False,
             attach_upper=True, use_nn=False, nn_model=None):
    """
    Initialize a new TokenParser.

    :param stem_threshold: We do not stem split parts shorter than or equal to this size.
    :param max_token_length: We cut identifiers longer than this value.
    :param min_split_length: We do not split source code identifiers shorter than this
                             value. If you do not want to filter small tokens set
                             min_split_length=1.
    :param single_shot: True if we do not want to join small identifiers to next one.
                        Example: 'sourced.ml.algorithms' →
                        ["sourc", "sourcedml", "algorithm", "mlalgorithm"].
                        If True we have only ["sourc", "algorithm"].
    :param save_token_style: value indicating whether to yield metadata that can be used
                             to reconstruct the initial identifier. Requires
                             `single_shot` to be True.
    :param attach_upper: True to attach the last of several uppercase letters in a row to
                         the next token. Example: 'HTMLResponse' -> ["html", "response"]
                         if True, 'HTMLResponse' -> ["htmlr", "esponse"] if False.
    :param use_nn: value indicating whether to use the Neural Network-based splitter
                   instead of the heuristics.
    :param nn_model: IdentifierSplitterBiLSTM model UUID to load. None means the most
                     recent.
    :raise ValueError: if `save_token_style` is True while `single_shot` is False.
    """
    # Reject the unsupported flag combination up front, before the stemmer is created
    # or a model is loaded.
    if save_token_style and not single_shot:
        raise ValueError("`save_token_style` requires `single_shot` to be True")
    self._stemmer = Stemmer.Stemmer("english")
    self._stemmer.maxCacheSize = 0
    self._stem_threshold = stem_threshold
    self._max_token_length = max_token_length
    self._min_split_length = min_split_length
    self._single_shot = single_shot
    self._save_token_style = save_token_style
    self._attach_upper = attach_upper
    # Stays None unless the NN splitter is requested; the `use_nn` property reads it.
    self._id_splitter_nn = None
    if use_nn:
        self._init_nn(nn_model)
@property
def use_nn(self):
    """True when a neural network splitter is set on this parser, False otherwise."""
    nn_splitter = self._id_splitter_nn
    return nn_splitter is not None
@property
def stem_threshold(self):
    """Length up to which (inclusive) split parts are left unstemmed by `stem`."""
    return self._stem_threshold

@stem_threshold.setter
def stem_threshold(self, value):
    """
    Set the stemming threshold.

    :raise TypeError: if `value` is not an int.
    :raise ValueError: if `value` is smaller than 1.
    """
    if isinstance(value, int):
        if value < 1:
            raise ValueError("stem_threshold must be greater than 0 - got %d" % value)
        self._stem_threshold = value
    else:
        raise TypeError("stem_threshold must be an integer - got %s" % type(value))
@property
def max_token_length(self):
    """Number of characters an identifier is cut down to before it is split."""
    return self._max_token_length

@max_token_length.setter
def max_token_length(self, value):
    """
    Set the maximum identifier length.

    :raise TypeError: if `value` is not an int.
    :raise ValueError: if `value` is smaller than 1.
    """
    if isinstance(value, int):
        if value < 1:
            raise ValueError("max_token_length must be greater than 0 - got %d" % value)
        self._max_token_length = value
    else:
        raise TypeError("max_token_length must be an integer - got %s" % type(value))
def _init_nn(self, nn_model):
    """
    Load the neural network identifier splitter and keep it on the instance.

    :param nn_model: forwarded to ``IdentifierSplitterBiLSTM().load`` as ``source``.
    """
    # Function-scope import: presumably keeps the model package optional for users of
    # the heuristic splitter - confirm before moving it to module level.
    from sourced.ml.core.models.id_splitter import IdentifierSplitterBiLSTM

    splitter = IdentifierSplitterBiLSTM()
    self._id_splitter_nn = splitter.load(source=nn_model)
@property
def min_split_length(self):
    """Minimum length a split part needs in order to be yielded on its own by `_split`."""
    return self._min_split_length

@min_split_length.setter
def min_split_length(self, value):
    """
    Set the minimum split length.

    :raise TypeError: if `value` is not an int.
    :raise ValueError: if `value` is smaller than 1.
    """
    if isinstance(value, int):
        if value < 1:
            raise ValueError("min_split_length must be greater than 0 - got %d" % value)
        self._min_split_length = value
    else:
        raise TypeError("min_split_length must be an integer - got %s" % type(value))
def __call__(self, token):
    """Make the parser callable; equivalent to calling `process_token(token)`."""
    processed = self.process_token(token)
    return processed
def process_token(self, token):
    """Split `token` with `split` and lazily yield each part passed through `stem`."""
    yield from map(self.stem, self.split(token))
def stem(self, word):
    """
    Stem a single word.

    Words whose length is at most `stem_threshold` are returned unchanged; longer words
    are passed to the stemmer's `stemWord`.
    """
    too_short = len(word) <= self.stem_threshold
    return word if too_short else self._stemmer.stemWord(word)
def _split(self, token):
    """
    Heuristically split one identifier into lower-cased subtokens (generator).

    The identifier is stripped, cut to `max_token_length`, broken on non-letter
    characters and then on lower/upper case boundaries. When `_save_token_style` is set,
    every item is a `(text, TokenStyle)` pair and the delimiters are yielded as well.
    """
    token = token.strip()[:self.max_token_length]

    # State shared by consecutive emit() calls for this identifier.
    pending_short = ""  # most recent subtoken shorter than min_split_length
    last_long = ""  # most recent subtoken of at least min_split_length characters

    def emit(name):
        nonlocal pending_short, last_long
        lowered = name.lower()
        if len(name) < self.min_split_length:
            # Too short to stand alone: unless single_shot, glue it to the previous long
            # subtoken now and remember it so it can prefix the next long one.
            if not self._single_shot:
                pending_short = lowered
                yield last_long + lowered
                last_long = ""
            return
        last_long = lowered
        yield lowered
        if pending_short and not self._single_shot:
            yield pending_short + lowered
            pending_short = ""

    def emit_with_style(name):
        # The casing is classified on the original (not lower-cased) text.
        if name.isupper():
            style = TokenStyle.TOKEN_UPPER
        elif name.islower():
            style = TokenStyle.TOKEN_LOWER
        else:
            style = TokenStyle.TOKEN_CAPITALIZED
        for subtoken in emit(name):
            yield subtoken, style

    if self._save_token_style:
        produce = emit_with_style
        breakup_re = self.NAME_BREAKUP_KEEP_DELIMITERS_RE
    else:
        produce = emit
        breakup_re = self.NAME_BREAKUP_RE

    for part in breakup_re.split(token):
        if not part:
            continue
        if self._save_token_style and not part.isalpha():
            yield part, TokenStyle.DELIMITER
            continue
        assert part.isalpha()
        start = 0
        # i is the index of `this`; `prev` is the character right before it.
        for i, (prev, this) in enumerate(zip(part, part[1:]), start=1):
            if prev.islower() and this.isupper():
                # lower -> UPPER boundary: close the current subtoken before `this`
                yield from produce(part[start:i])
                start = i
            elif prev.isupper() and this.islower():
                # UPPER -> lower boundary: with attach_upper the last capital of a run
                # of capitals moves to the following subtoken
                move_capital = self._attach_upper and i > 1 and part[i - 2].isupper()
                boundary = i - 1 if move_capital else i
                if i - 1 > start:
                    yield from produce(part[start:boundary])
                    start = boundary
        tail = part[start:]
        if tail:
            yield from produce(tail)
def __pre_split_token(self, token: str) -> [str]:
    """Split a token on runs of non-letter characters and drop the empty pieces."""
    return [piece for piece in re.split(self.NAME_BREAKUP_RE, token) if piece != ""]
def split(self, token: str) -> [str]:
    """
    Splits a single identifier.

    Without the NN splitter this delegates to the heuristic `_split`. With it, the token
    is first broken on non-letter characters, the pieces are handed to the model, and
    the non-empty subtokens of every piece are yielded in order.
    """
    if not self.use_nn:
        yield from self._split(token)
        return
    pieces = self.__pre_split_token(token)
    for subtokens in self._id_splitter_nn.split(pieces):
        yield from (subtoken for subtoken in subtokens if subtoken != "")
def split_batch(self, tokens: [str]) -> [[str]]:
    """
    Splits a batch of identifiers.

    :param tokens: identifiers to split.
    :return: one sequence of subtokens per identifier, in input order. With the NN \
             splitter this is a list of lists; with the heuristic splitter it is a lazy \
             `map` object whose items are the generators returned by `_split`.
    """
    if self.use_nn:
        # Go through split() so every identifier is pre-split on non-letter characters
        # and stripped of empty subtokens exactly like a single-token call. The pre-split
        # pieces used to be computed here and then discarded, with the raw identifiers
        # sent to the model instead.
        # NOTE(review): this issues one model call per identifier - confirm that is
        # acceptable for the batch sizes used by callers.
        return [list(self.split(token)) for token in tokens]
    return map(self._split, tokens)
@staticmethod
def reconstruct(tokens):
    """
    Rebuild an identifier from `(text, TokenStyle)` pairs.

    :param tokens: iterable of `(text, style)` pairs.
    :return: the concatenation of all texts with their casing restored according to \
             the style; pairs whose style is none of the four handled values are skipped.
    """
    res = []
    for t, meta in tokens:
        if meta == TokenStyle.DELIMITER:
            res.append(t)
        elif meta == TokenStyle.TOKEN_LOWER:
            res.append(t.lower())
        elif meta == TokenStyle.TOKEN_UPPER:
            res.append(t.upper())
        elif meta == TokenStyle.TOKEN_CAPITALIZED:
            # t[:1] instead of t[0]: an empty text contributes nothing instead of
            # raising IndexError.
            res.append(t[:1].upper() + t[1:])
    return "".join(res)
def __getstate__(self):
    """Return a shallow copy of the instance dict without the `_stemmer` entry."""
    state = dict(self.__dict__)
    # The stemmer is left out of the state; __setstate__ creates a new one.
    state.pop("_stemmer")
    return state
def __setstate__(self, state):
    """
    Restore the instance from `state` and create a new stemmer.

    :param state: attribute dict as produced by `__getstate__` (no `_stemmer` entry).
    """
    self.__dict__ = state
    self._stemmer = Stemmer.Stemmer("english")
    # Same setting __init__ applies to a new stemmer, so that a restored parser is
    # configured like a freshly constructed one.
    self._stemmer.maxCacheSize = 0
class NoopTokenParser:
    """
    One can use this class if one does not want to do any parsing.
    """

    def __call__(self, token):
        """Make the parser callable; equivalent to `process_token(token)`."""
        return self.process_token(token)

    def process_token(self, token):
        """Yield `token` itself, unchanged, as the only result."""
        yield from (token,)