Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
F
fcs-clarin-endpoint-hamburg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Arkhangelskiy, Timofey
fcs-clarin-endpoint-hamburg
Commits
ffb2fcd7
Commit
ffb2fcd7
authored
1 year ago
by
Timofey Arkhangelskiy
Browse files
Options
Downloads
Patches
Plain Diff
Start AnnisQueryParser class
parent
d2836150
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
common/annis_query_parser.py
+221
-0
221 additions, 0 deletions
common/annis_query_parser.py
common/config.py
+1
-0
1 addition, 0 deletions
common/config.py
common/litterae_query_parser.py
+8
-0
8 additions, 0 deletions
common/litterae_query_parser.py
with
230 additions
and
0 deletions
common/annis_query_parser.py
0 → 100644
+
221
−
0
View file @
ffb2fcd7
from
urllib.parse
import
quote
import
re
import
json
import
urllib.request
from
.query_parser
import
QueryParser
from
.config
import
ResourceConfig
from
.diagnostics
import
Diagnostic
,
DiagnosticTypes
class
AnnisQueryParser
(
QueryParser
):
"""
Parses search queries for ANNIS-based corpora.
"""
rxTsakorpusBool
=
re
.
compile
(
'
[()|,]
'
)
def build_get_string(self, params, config: ResourceConfig, withinClause=''):
    """
    Build a payload for an ANNIS search request.
    ANNIS uses POST with JSON payload rather than GET, but the
    function name is the same as in the other classes for
    compatibility.

    params is a list of [name, term index, value] triples. Each 'wf'
    triple contributes its value (with double quotes removed) as one
    clause; every other triple is rendered as
    '#<term index> <name> #<value>'. All clauses are joined with ' & ',
    'wf' clauses first, so no dangling '&' is left when there are no
    other clauses.

    Returns a dict with the keys 'query', 'query_language', 'corpora',
    'limit' and 'order'.

    Raises Diagnostic (SRU, code 48) if withinClause is non-empty and
    is neither 'text' nor 'session'.
    """
    if len(withinClause) > 0 and withinClause not in ('text', 'session'):
        raise Diagnostic(DiagnosticTypes.sru, 48,
                         message='ANNIS only supports multi-word search within '
                                 'a text (with a default maximum distance of '
                                 '50 tokens).')
    q = {
        'query': '',
        'query_language': 'AQL',
        'corpora': config.annis_corpus_list,
        'limit': config.max_hits,
        'order': 'Randomized'
    }
    tokenClauses = []
    relationClauses = []
    for param in sorted(params):
        if param[0] == 'wf':
            # NOTE(review): double quotes are removed here although
            # term_query() wraps each term in them -- confirm this is
            # the intended AQL form.
            tokenClauses.append(param[2].replace('"', ''))
        else:
            relationClauses.append('#' + str(param[1]) + ' ' + param[0]
                                   + ' #' + str(param[2]))
    # Joining (instead of appending ' & ' after every clause and
    # stripping afterwards) guarantees that '&' only appears between
    # two clauses.
    q['query'] = ' & '.join(tokenClauses + relationClauses)
    return q
def term_query(self, query: str, config: ResourceConfig):
    """
    Return list of query parameters for one term or sequence of terms.

    A pair of enclosing double quotes is removed from query first. The
    rest is split on single spaces; empty pieces are ignored. The i-th
    term (counting from 1) yields ['wf', i, '"<term>"'] with any double
    quotes inside the term removed, and every term except the first is
    followed by ['.', i, i - 1].

    Raises Diagnostic (SRU, code 10) if nothing is left of the query
    after removing the enclosing quotes.
    """
    if len(query) >= 2 and query[0] == '"' and query[-1] == '"':
        query = query[1:-1]
    if not query:
        raise Diagnostic(DiagnosticTypes.sru, 10)

    words = [piece for piece in query.split(' ') if piece]
    result = []
    for position, word in enumerate(words, start=1):
        result.append(['wf', position, '"' + word.replace('"', '') + '"'])
        if position > 1:
            result.append(['.', position, position - 1])
    return result
def binary_bool(self, strOp: str, operandL, operandR, config):
    """
    Combine two parameter lists with a boolean operator.

    The term indexes of operandR are shifted by the highest term index
    of operandL. The result is operandL followed by the shifted
    operandR and one extra entry [operator, highest left index, lowest
    right index], where the operator is '^*' for 'AND' and '|' for 'OR'.

    Raises Diagnostic (SRU):
      10 if either operand is empty;
      47 if the first entry of either operand is not a 'wf' entry;
      48 if either side has more than one term index and any entry is
         neither 'wf' nor the operator being applied;
      37 if strOp is neither 'AND' nor 'OR'.
    """
    if len(operandL) == 0 or len(operandR) == 0:
        raise Diagnostic(DiagnosticTypes.sru, 10)

    leftIndexes = self.term_indexes(operandL)
    operandR = self.shift_term_indexes(operandR, max(leftIndexes))
    rightIndexes = self.term_indexes(operandR)

    if not (operandL[0][0] == 'wf' and operandR[0][0] == 'wf'):
        raise Diagnostic(DiagnosticTypes.sru, 47)

    operators = {'AND': '^*', 'OR': '|'}
    if strOp not in operators:
        raise Diagnostic(DiagnosticTypes.sru, 37, details=strOp)
    joinOp = operators[strOp]

    allowed = ('wf', joinOp)
    severalTerms = len(leftIndexes) > 1 or len(rightIndexes) > 1
    if severalTerms and (any(entry[0] not in allowed for entry in operandR)
                         or any(entry[0] not in allowed for entry in operandL)):
        message = 'ANNIS does not support queries that combine several ' \
                  'multi-word sequences with boolean operators or multiple ' \
                  'boolean operators.'
        raise Diagnostic(DiagnosticTypes.sru, 48, message=message)

    link = [joinOp, max(leftIndexes), min(rightIndexes)]
    return operandL + operandR + [link]
def not_bool(self, operand, config):
    """
    Negation of an operand. Not implemented yet: always raises
    NotImplementedError.
    """
    # TODO: implement
    raise NotImplementedError
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str,
                          config: ResourceConfig):
    """
    Return list of query parameters for one term in an advanced query.

    The result is a one-element list [[layer, 1, '/<value>/']] where
    every '/' inside value is escaped as '\\/'. The layer is 'wf' for
    the identifier 'text' and the identifier itself otherwise. For the
    identifier 'pos', value is first replaced by
    config.pos_convert_reverse[value] if such a key exists (UD to
    corpus-specific POS tags).

    Raises Diagnostic (SRU):
      10 if value is empty;
      48 if flags, with slashes stripped from both ends, is not one of
         '', 'I', 'C';
      10 if op is not '='.
    """
    flags = flags.strip('/')
    if not value:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    if flags not in ('', 'I', 'C'):
        raise Diagnostic(DiagnosticTypes.sru, 48, message='ANNIS does not support regex flags.')
    if op != '=':
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message='In token queries, only = is allowed as operators.')

    if identifier == 'pos' and value in config.pos_convert_reverse:
        # UD to corpus-specific POS tags
        value = config.pos_convert_reverse[value]

    # Identifiers other than 'text' are passed through as layer names
    # unchanged; unknown identifiers are currently not rejected.
    layer = 'wf' if identifier == 'text' else identifier
    pattern = '/' + value.replace('/', '\\/') + '/'
    return [[layer, 1, pattern]]
# TODO: continue here
def adv_quantify_segment(self, getParams, quantifier: str, config: ResourceConfig):
    """
    Turn a quantified empty token query into distance constraints.

    getParams must contain exactly one entry of the form
    ['wf', <term index>, '.*']. The quantifier sets the distance range
    (defaults: 1 to 100): '?' sets the maximum to 2, '+' sets the
    minimum to 2, a quantifier matched by self.rxQuantifierExact sets
    both to the number between its first and last character, and one
    matched by self.rxQuantifierInterval sets each bound to the
    respective non-empty group plus one.

    Returns three entries for the term index: 'word_rel_' (pointing to
    the previous index), 'word_dist_from_' and 'word_dist_to_' (the
    distances as strings).

    Raises Diagnostic (SRU): 48 if getParams does not have the form
    above, 10 if the quantifier is not recognized.
    """
    # NOTE(review): the parameter names and the '.*' check look carried
    # over from the Tsakorpus parser -- confirm against what
    # adv_term_query_proper() produces for ANNIS.
    if len(getParams) != 1 or getParams[0][0] != 'wf' or getParams[0][2] != '.*':
        raise Diagnostic(DiagnosticTypes.sru, 48,
                         message='Token quantifiers are only allowed with empty token queries '
                                 'in Tsakorpus (for setting distance constraints).')
    minDist = 1
    maxDist = 100
    if quantifier == '?':
        maxDist = 2
    elif quantifier == '+':
        minDist = 2
    elif self.rxQuantifierExact.search(quantifier) is not None:
        # Drop the enclosing first and last character, keep the number.
        minDist = maxDist = int(quantifier[1:-1])
    else:
        m = self.rxQuantifierInterval.search(quantifier)
        if m is None:
            raise Diagnostic(DiagnosticTypes.sru, 10,
                             message='Something is wrong with a token quantifier.')
        if len(m.group(1)) > 0:
            minDist = int(m.group(1)) + 1
        if len(m.group(2)) > 0:
            maxDist = int(m.group(2)) + 1
    termIndex = getParams[0][1]
    return [
        ['word_rel_', termIndex, termIndex - 1],
        ['word_dist_from_', termIndex, str(minDist)],
        ['word_dist_to_', termIndex, str(maxDist)]
    ]
def adv_main_sequence(self, operandL, operandR, config: ResourceConfig):
    """
    Join two parameter lists as consecutive parts of a sequence.

    The term indexes of operandR are shifted by the highest term index
    of operandL. Unless operandL already contains a 'word_rel_' entry,
    or the shifted operandR contains a 'word_rel_' entry whose value is
    the highest left index, three entries are appended to the shifted
    operandR for its lowest term index: 'word_rel_' (pointing to the
    highest left index), 'word_dist_from_' and 'word_dist_to_' (both
    '1').

    Returns operandL followed by the shifted operandR.

    Raises Diagnostic (SRU, code 10) if either operand is empty.
    """
    if len(operandL) == 0 or len(operandR) == 0:
        raise Diagnostic(DiagnosticTypes.sru, 10)

    leftIndexes = self.term_indexes(operandL)
    lastLeft = max(leftIndexes)
    operandR = self.shift_term_indexes(operandR, lastLeft)
    rightIndexes = self.term_indexes(operandR)

    # Find out if there is already a distance constraint
    constrainedLeft = any(entry[0] == 'word_rel_' for entry in operandL)
    if constrainedLeft or any(entry[0] == 'word_rel_' and entry[2] == lastLeft
                              for entry in operandR):
        return operandL + operandR

    firstRight = min(rightIndexes)
    operandR += [
        ['word_rel_', firstRight, lastLeft],
        ['word_dist_from_', firstRight, '1'],
        ['word_dist_to_', firstRight, '1']
    ]
    return operandL + operandR
def adv_binary_bool(self, strOp: str, operandL, operandR, config: ResourceConfig):
    """
    Join multiple constraints on one word in an advanced query.

    '&' is rewritten to ','. For every entry of operandR, each entry of
    operandL with the same name is merged with it into
    [name, left index, '(<left value>)<op>(<right value>)']; a right
    entry without a left counterpart is copied as is. Left entries whose
    name does not occur in operandR are copied at the end.

    Raises Diagnostic (SRU, code 48) for a conjunction of two
    constraints with the same name other than 'gr', and for a
    disjunction where a left entry has no right counterpart.
    """
    print('ADVANCED INTERNAL BOOL', strOp, str(operandL), str(operandR))
    joiner = ',' if strOp == '&' else strOp
    namesR = {entry[0] for entry in operandR}
    merged = []

    for right in operandR:
        counterparts = [left for left in operandL if left[0] == right[0]]
        for left in counterparts:
            if joiner == ',' and left[0] != 'gr':
                raise Diagnostic(DiagnosticTypes.sru, 48,
                                 message='Tsakorpus endpoint does not support conjunctions '
                                         'of multiple constraints for the same layer '
                                         'within the same word.')
            merged.append([left[0], left[1],
                           '(' + left[2] + ')' + joiner + '(' + right[2] + ')'])
        if not counterparts:
            merged.append(right[:])

    for left in operandL:
        if left[0] in namesR:
            continue
        if joiner == '|':
            raise Diagnostic(DiagnosticTypes.sru, 48,
                             message='Tsakorpus does not support disjunctions '
                                     'of constraints for multiple layers '
                                     'within the same word.')
        merged.append(left[:])
    return merged
def send_query(self, strGetParams: str, config: ResourceConfig):
    """
    Send the translated query to the corpus and return the JSON results
    returned by it, decoded into Python objects.

    The request URL is config.resource_base_url (with slashes stripped
    from both ends) followed by '/search_sent?' and strGetParams. The
    response body is decoded with the charset announced by the server,
    falling back to UTF-8.
    """
    # NOTE(review): the '/search_sent?' path and the GET request look
    # carried over from the Tsakorpus parser, while build_get_string()
    # states that ANNIS expects POST with a JSON payload -- confirm.
    url = config.resource_base_url.strip('/') + '/search_sent?' + strGetParams
    print(url)
    # The context manager closes the HTTP response even if reading it
    # fails, so the connection is not leaked.
    with urllib.request.urlopen(url) as response:
        data = response.read()
        encoding = response.info().get_content_charset('utf-8')
    responseJSON = json.loads(data.decode(encoding))
    return responseJSON
if
__name__
==
'
__main__
'
:
pass
This diff is collapsed.
Click to expand it.
common/config.py
+
1
−
0
View file @
ffb2fcd7
...
...
@@ -40,6 +40,7 @@ class ResourceConfig:
self
.
adv_supported
=
False
self
.
supported_layers
=
[]
self
.
resources
=
[]
self
.
annis_corpus_list
=
[]
self
.
search_lang_id
=
''
self
.
pos_convert
=
[]
# corpus-specific to UD (regexes)
self
.
pos_convert_reverse
=
{}
# UD to corpus-specific
...
...
This diff is collapsed.
Click to expand it.
common/litterae_query_parser.py
+
8
−
0
View file @
ffb2fcd7
...
...
@@ -81,6 +81,14 @@ class LitteraeQueryParser(QueryParser):
raise
NotImplementedError
()
def
adv_quantify_segment
(
self
,
getParams
,
quantifier
:
str
,
config
:
ResourceConfig
):
"""
This function is not used as of now.
It is only used in an advanced search, which is switched off for FLC
for now, and implements the only non-trivial advanced capability that
exists in FLC, namely distance constraints (set with the
'
slop_
'
parameters
in the API). If advanced search is enabled for FLC at a future point,
do not forget to take
'
slop_
'
values into account in build_get_string().
"""
if
len
(
getParams
)
!=
1
or
getParams
[
0
][
0
]
!=
'
q_
'
or
getParams
[
0
][
2
]
!=
'
.*
'
:
raise
Diagnostic
(
DiagnosticTypes
.
sru
,
48
,
message
=
'
Token quantifiers are only allowed with empty token queries
'
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment