From 45d619ad3434607481de500bcec7bcb2419c88ff Mon Sep 17 00:00:00 2001 From: Hagen Peukert <hagen.peukert@uni-hamburg.de> Date: Fri, 1 Apr 2022 10:28:30 +0200 Subject: [PATCH] Lemmatization implemented --- Morphochron/src/Init.java | 5 ++++- Morphochron/src/OED.java | 20 +++++++++++++++----- Morphochron/src/SuffixEnum.java | 6 +++--- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Morphochron/src/Init.java b/Morphochron/src/Init.java index f1a3941..226c6ab 100644 --- a/Morphochron/src/Init.java +++ b/Morphochron/src/Init.java @@ -92,7 +92,10 @@ public class Init // normalizedWords.add("dignitary"); // normalizedWords.add("proposition"); // normalizedWords.add("daskommtnichtvor"); -// +// normalizedWords.add("annoyaunce"); +// normalizedWords.add("assygnement"); +// normalizedWords.add("daskommtnichtvor"); + frame.setMessage("All words of type " + wordclass + " selected\n"); //detect affixes in word list as a pre-processing and countercheck these with OED REST API diff --git a/Morphochron/src/OED.java b/Morphochron/src/OED.java index 7369afc..52ea05f 100644 --- a/Morphochron/src/OED.java +++ b/Morphochron/src/OED.java @@ -54,6 +54,14 @@ public class OED this.resultPath = resultPath; } + /* + * gets lemmatized word representation of OED REST API as JSON object + */ + private String getRESTAPILemmatizedWord(String word) + { + return "https://oed-researcher-api.oxfordlanguages.com/oed/api/v0.2/lemmatize/?form=" + word;// + "&part_of_speech=" + wordclass; + } + /* * gets word representation of OED REST API as JSON object */ @@ -80,11 +88,11 @@ public class OED JSONArray arr = obj.getJSONArray("data"); for (int i = 0; i < arr.length(); i++) { - wordid = arr.getJSONObject(i).getString("id"); + wordid = arr.getJSONObject(i).getJSONObject("word").getString("id"); //System.out.println("Wort-ID:" + wordid); - if (wordid.equals(word + wordclass)) break; //words may be part of several word classes + if (wordid.endsWith(wordclass)) break; //words may be part of several word classes } - + return wordid; } @@ -181,7 +189,8 @@ public class OED { //Map<String, Integer> oedData = new HashMap<String,Integer>(); Boolean entryAvailable = false; - String wordJSON = getRESTAPIWordRepresentation(word.toLowerCase()); + //String wordJSON = getRESTAPIWordRepresentation(word.toLowerCase()); + String wordJSON = getRESTAPILemmatizedWord(word.toLowerCase()); JSONObject jo = getJSonResponse(wordJSON); String id = processJSonWordID(jo); if (!id.isEmpty()) @@ -199,7 +208,8 @@ public class OED public Set<String> getMorphemesOEDRequest() { - String wordJSON = getRESTAPIWordRepresentation(word.toLowerCase()); + //String wordJSON = getRESTAPIWordRepresentation(word.toLowerCase()); + String wordJSON = getRESTAPILemmatizedWord(word.toLowerCase()); JSONObject jo = getJSonResponse(wordJSON); String id = processJSonWordID(jo); Set<String> affixes = new HashSet<String>(); diff --git a/Morphochron/src/SuffixEnum.java b/Morphochron/src/SuffixEnum.java index 2093588..8106fb2 100644 --- a/Morphochron/src/SuffixEnum.java +++ b/Morphochron/src/SuffixEnum.java @@ -88,9 +88,9 @@ public enum SuffixEnum { ward("ward"), wards("wards"), ware("ware"), uaeras("ware"), uaras("ware"), uaro("ware"), waeras("ware"), wara("ware"), waran("ware"), waras("ware"), waru("ware"), wearan("ware"), waeren("ware"), warae("ware"), wick("wick"), y("y"), ig("y"), ye("y"), igan("y"), izen("y"), ezen("y"), yen("y"), ey("y"), yl("yl"), yne("yne"); - private final String morpheme; - - //constructor + + private String morpheme; + SuffixEnum(String morpheme) { this.morpheme = morpheme; } -- GitLab