From 0752838afccbdfb851f6d675ff1c20691a4aaf02 Mon Sep 17 00:00:00 2001 From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de> Date: Fri, 23 Dec 2022 21:00:37 +0100 Subject: [PATCH] Add conversion mechanism for corpus-specific POS --- common/config.py | 9 +++++---- common/tsakorpus_response_parser.py | 24 +++++++++++++++++++++++- config/test.json | 6 +++++- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/common/config.py b/common/config.py index df1191c..40206b2 100644 --- a/common/config.py +++ b/common/config.py @@ -40,6 +40,7 @@ class ResourceConfig: self.supported_layers = [] self.resources = [] self.search_lang_id = '' + self.pos_convert = {} self.query_timeout = 60 @@ -47,16 +48,16 @@ class ResourceConfig: if type(self.__dict__[k]) == bool) self.intParams = set(k for k in self.__dict__ if type(self.__dict__[k]) == int) - self.lsParams = {} + self.lsParams = set() # dictionaries where values are strings - self.dict_sParams = {} + self.dict_sParams = {'pos_convert'} # dictionaries where values are lists of strings - self.dict_lsParams = {} + self.dict_lsParams = set() # dictionaries where values are dictionaries {k: string} - self.dict_dParams = {} + self.dict_dParams = set() if fnameConfig is not None and os.path.exists(fnameConfig): self.load_settings(fnameConfig) diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py index 2af8482..d70b8a4 100644 --- a/common/tsakorpus_response_parser.py +++ b/common/tsakorpus_response_parser.py @@ -9,12 +9,32 @@ from .search_retrieve import Record from .diagnostics import Diagnostic, DiagnosticTypes +class POSConvertor: + """ + Convert corpus-specific parts of speech / grammar tags to + UPOS, using regexes correspondences set in the config. + """ + def __init__(self, config: ResourceConfig): + self.posConvert = config.pos_convert + self.posTests = [(re.compile(k), v) for k, v in self.posConvert] + + def convert_pos(self, pos): + """ + Convert corpus-specific POS tags to UPOS, if possible. + Ea + """ + for k, v in self.posTests: + if k.search(pos) is not None: + return v + return pos + + class TsakorpusResponseParser: """ Parses responses from a Tsakorpus instance. """ def __init__(self): - pass + self.pc = None # POS convertor, rebuilt with each parse call def parse_annotation(self, anno, segID, record): """ @@ -37,6 +57,7 @@ class TsakorpusResponseParser: for node in posNodes: if node.text is not None: posText = re.sub(' |[ \t\ufeff]+', '', node.text) + posText = self.pc.convert_pos(posText) pos.add(posText) if len(pos) > 0: posStr = '|'.join(p for p in sorted(pos)) @@ -119,6 +140,7 @@ class TsakorpusResponseParser: instance. Return a list of Record objects and the total number of records found. """ + self.pc = POSConvertor(config) nRecords = 0 if 'n_sentences' in response: nRecords = response['n_sentences'] diff --git a/config/test.json b/config/test.json index cc8503e..5cd0366 100644 --- a/config/test.json +++ b/config/test.json @@ -4,5 +4,9 @@ "max_hits": 15, "platform": "tsakorpus", "resource_base_url": "http://127.0.0.1:7342", - "search_lang_id": "beserman" + "search_lang_id": "beserman", + "pos_convert": [ + ["\\bN\\b", "NOUN"], + ["\\bV\\b", "VERB"] + ] } \ No newline at end of file -- GitLab