Skip to content
Snippets Groups Projects
Commit 0752838a authored by Arkhangelskiy, Timofey's avatar Arkhangelskiy, Timofey
Browse files

Add conversion mechanism for corpus-specific POS

parent 8e4aefee
Branches
No related tags found
No related merge requests found
......@@ -40,6 +40,7 @@ class ResourceConfig:
self.supported_layers = []
self.resources = []
self.search_lang_id = ''
self.pos_convert = {}
self.query_timeout = 60
......@@ -47,16 +48,16 @@ class ResourceConfig:
if type(self.__dict__[k]) == bool)
self.intParams = set(k for k in self.__dict__
if type(self.__dict__[k]) == int)
self.lsParams = {}
self.lsParams = set()
# dictionaries where values are strings
self.dict_sParams = {}
self.dict_sParams = {'pos_convert'}
# dictionaries where values are lists of strings
self.dict_lsParams = {}
self.dict_lsParams = set()
# dictionaries where values are dictionaries {k: string}
self.dict_dParams = {}
self.dict_dParams = set()
if fnameConfig is not None and os.path.exists(fnameConfig):
self.load_settings(fnameConfig)
......
......@@ -9,12 +9,32 @@ from .search_retrieve import Record
from .diagnostics import Diagnostic, DiagnosticTypes
class POSConvertor:
    """
    Convert corpus-specific parts of speech / grammar tags to
    UPOS, using regex correspondences set in the config.
    """

    def __init__(self, config: 'ResourceConfig'):
        """
        Compile the conversion rules stored in ``config.pos_convert``.

        The rules may be given either as a dictionary {regex: tag}
        or as a sequence of [regex, tag] pairs. A missing (None) or
        empty value means that no conversion takes place.
        """
        self.posConvert = config.pos_convert
        rules = self.posConvert or []
        if isinstance(rules, dict):
            # Iterating over a dict directly would yield bare keys,
            # which cannot be unpacked into (regex, tag).
            rules = rules.items()
        self.posTests = [(re.compile(k), v) for k, v in rules]

    def convert_pos(self, pos):
        """
        Convert corpus-specific POS tags to UPOS, if possible.

        The rules are tried in the order in which they were compiled;
        the tag of the first rule whose regex is found anywhere in
        pos is returned. If no rule matches, pos is returned unchanged.
        """
        for k, v in self.posTests:
            if k.search(pos) is not None:
                return v
        return pos
class TsakorpusResponseParser:
"""
Parses responses from a Tsakorpus instance.
"""
def __init__(self):
pass
self.pc = None # POS convertor, rebuilt with each parse call
def parse_annotation(self, anno, segID, record):
"""
......@@ -37,6 +57,7 @@ class TsakorpusResponseParser:
for node in posNodes:
if node.text is not None:
posText = re.sub(' |[ \t\ufeff]+', '', node.text)
posText = self.pc.convert_pos(posText)
pos.add(posText)
if len(pos) > 0:
posStr = '|'.join(p for p in sorted(pos))
......@@ -119,6 +140,7 @@ class TsakorpusResponseParser:
instance. Return a list of Record objects and the total number of
records found.
"""
self.pc = POSConvertor(config)
nRecords = 0
if 'n_sentences' in response:
nRecords = response['n_sentences']
......
......@@ -4,5 +4,9 @@
"max_hits": 15,
"platform": "tsakorpus",
"resource_base_url": "http://127.0.0.1:7342",
"search_lang_id": "beserman"
"search_lang_id": "beserman",
"pos_convert": [
["\\bN\\b", "NOUN"],
["\\bV\\b", "VERB"]
]
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment