From 0752838afccbdfb851f6d675ff1c20691a4aaf02 Mon Sep 17 00:00:00 2001
From: Timofey Arkhangelskiy <timofey.arkhangelskiy@uni-hamburg.de>
Date: Fri, 23 Dec 2022 21:00:37 +0100
Subject: [PATCH] Add conversion mechanism for corpus-specific POS

---
 common/config.py                    |  9 +++++----
 common/tsakorpus_response_parser.py | 24 +++++++++++++++++++++++-
 config/test.json                    |  6 +++++-
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/common/config.py b/common/config.py
index df1191c..40206b2 100644
--- a/common/config.py
+++ b/common/config.py
@@ -40,6 +40,7 @@ class ResourceConfig:
         self.supported_layers = []
         self.resources = []
         self.search_lang_id = ''
+        self.pos_convert = {}
 
         self.query_timeout = 60
 
@@ -47,16 +48,16 @@ class ResourceConfig:
                               if type(self.__dict__[k]) == bool)
         self.intParams = set(k for k in self.__dict__
                              if type(self.__dict__[k]) == int)
-        self.lsParams = {}
+        self.lsParams = set()
 
         # dictionaries where values are strings
-        self.dict_sParams = {}
+        self.dict_sParams = {'pos_convert'}
 
         # dictionaries where values are lists of strings
-        self.dict_lsParams = {}
+        self.dict_lsParams = set()
 
         # dictionaries where values are dictionaries {k: string}
-        self.dict_dParams = {}
+        self.dict_dParams = set()
 
         if fnameConfig is not None and os.path.exists(fnameConfig):
             self.load_settings(fnameConfig)
diff --git a/common/tsakorpus_response_parser.py b/common/tsakorpus_response_parser.py
index 2af8482..d70b8a4 100644
--- a/common/tsakorpus_response_parser.py
+++ b/common/tsakorpus_response_parser.py
@@ -9,12 +9,32 @@ from .search_retrieve import Record
 from .diagnostics import Diagnostic, DiagnosticTypes
 
 
+class POSConvertor:
+    """
+    Convert corpus-specific parts of speech / grammar tags to UPOS, using
+    the regex correspondences set in the config (pos_convert).
+    """
+    def __init__(self, config: ResourceConfig):
+        self.posConvert = config.pos_convert    # dict {regex: tag} or list of [regex, tag]
+        pairs = self.posConvert.items() if isinstance(self.posConvert, dict) else self.posConvert
+        self.posTests = [(re.compile(k), v) for k, v in pairs]
+
+    def convert_pos(self, pos):
+        """
+        Convert a corpus-specific POS tag to UPOS, if possible.
+        Return the tag of the first regex found in pos, or pos itself if none matches.
+        """
+        for k, v in self.posTests:
+            if k.search(pos) is not None:
+                return v
+        return pos
+
+
 class TsakorpusResponseParser:
     """
     Parses responses from a Tsakorpus instance.
     """
     def __init__(self):
-        pass
+        self.pc = None      # POS convertor, rebuilt with each parse call
 
     def parse_annotation(self, anno, segID, record):
         """
@@ -37,6 +57,7 @@ class TsakorpusResponseParser:
         for node in posNodes:
             if node.text is not None:
                 posText = re.sub('&nbsp;|[  \t\ufeff]+', '', node.text)
+                posText = self.pc.convert_pos(posText)
                 pos.add(posText)
         if len(pos) > 0:
             posStr = '|'.join(p for p in sorted(pos))
@@ -119,6 +140,7 @@ class TsakorpusResponseParser:
         instance. Return a list of Record objects and the total number of
         records found.
         """
+        self.pc = POSConvertor(config)
         nRecords = 0
         if 'n_sentences' in response:
             nRecords = response['n_sentences']
diff --git a/config/test.json b/config/test.json
index cc8503e..5cd0366 100644
--- a/config/test.json
+++ b/config/test.json
@@ -4,5 +4,9 @@
 	"max_hits": 15,
 	"platform": "tsakorpus",
 	"resource_base_url": "http://127.0.0.1:7342",
-	"search_lang_id": "beserman"
+	"search_lang_id": "beserman",
+	"pos_convert": [
+		["\\bN\\b", "NOUN"],
+		["\\bV\\b", "VERB"]
+	]
 }
\ No newline at end of file
-- 
GitLab