Source code for sourced.ml.core.extractors.literals
import codecs
from collections import defaultdict
import os
from sourced.ml.core.algorithms.uast_ids_to_bag import uast2sequence, UastIds2Bag
from sourced.ml.core.extractors.bags_extractor import BagsExtractor
from sourced.ml.core.extractors.helpers import register_extractor
from sourced.ml.core.utils import bblfsh_roles
class HashedTokenParser:
    """
    Token parser which replaces each token with the hex form of its hash.
    """

    def process_token(self, token):
        """
        Produce the hashed representation of a single token.

        :param token: Any hashable object; it is passed to the built-in `hash()`.
        :return: Generator which yields exactly one string: 16 lowercase hex characters \
                 encoding the low 64 bits of `hash(token)` in little-endian byte order.
        """
        # NOTE(review): the built-in hash() of str/bytes is salted per interpreter
        # process unless PYTHONHASHSEED is fixed, so the produced keys are only
        # comparable between processes which share the same seed - confirm that the
        # deployment pins it.
        # Masking maps a possibly negative hash into the unsigned 64-bit range so
        # that it always fits into 8 bytes.
        unsigned_hash = hash(token) & 0xffffffffffffffff
        raw_bytes = unsigned_hash.to_bytes(8, "little")
        yield codecs.encode(raw_bytes, "hex_codec").decode()
class Literals2Bag(UastIds2Bag):
    """
    Builds a bag-of-literals from a UAST.
    """

    # XPath query which selects the literal nodes. It is not used by `__call__` below,
    # which filters the nodes by role instead (see the HOTFIX note there).
    XPATH = "//*[@roleLiteral]"

    def __init__(self, token2index=None, token_parser=None):
        """
        :param token2index: The mapping from tokens to bag keys. It is forwarded as-is to \
                            the parent class; presumably None disables the mapping there \
                            (TODO confirm in :class:`UastIds2Bag`).
        :param token_parser: Custom token parser: an object with `process_token(token)` \
                             which returns an iterable of sub-tokens. \
                             :class:`HashedTokenParser` is used if it is not specified.
        """
        if token_parser is None:
            token_parser = HashedTokenParser()
        super().__init__(token2index, token_parser)

    def __call__(self, uast):
        """
        HOTFIX for https://github.com/bblfsh/client-python/issues/92

        Converts a UAST to a weighted bag-of-literals. The weights are the literal
        frequencies. Each literal token is preprocessed by `_token_parser` first.
        `__call__` is overridden to avoid the issues with `bblfsh.filter`: the nodes
        are selected by their roles instead of running the XPath query.

        :param uast: The UAST root node.
        :return: bag - `defaultdict(int)` which maps bag keys to the number of occurrences.
        """
        counts = defaultdict(int)
        for node in uast2sequence(uast):
            if bblfsh_roles.LITERAL not in node.roles:
                continue
            for piece in self._token_parser.process_token(node.token):
                try:
                    key = self._token2index[piece]
                except KeyError:
                    # Pieces which are missing from the vocabulary are skipped silently.
                    continue
                counts[key] += 1
        return counts