Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
F
fcs-clarin-endpoint-hamburg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Arkhangelskiy, Timofey
fcs-clarin-endpoint-hamburg
Commits
5ff4a239
Commit
5ff4a239
authored
2 years ago
by
Arkhangelskiy, Timofey
Browse files
Options
Downloads
Patches
Plain Diff
Continue with advanced search parsing (not ready yet)
parent
241542fc
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
common/query_parser.py
+128
-0
128 additions, 0 deletions
common/query_parser.py
common/tsakorpus_query_parser.py
+23
-0
23 additions, 0 deletions
common/tsakorpus_query_parser.py
with
151 additions
and
0 deletions
common/query_parser.py
+
128
−
0
View file @
5ff4a239
...
...
@@ -17,6 +17,10 @@ class QueryParser:
# Trailing "within <context>" clause at the very end of a query.
rxWithinClause = re.compile(' +within +(s|sentence|u|utterance|p|paragraph|'
                            't|turn|text|session) *$')
# Matches any character that is not a space, tab, CR or LF.
rxNonemptyQueryPart = re.compile('[^ \t\r\n]')
# One bracketed segment of an advanced query: group 1 is the expression
# between the square brackets, group 2 the optional quantifier
# ({...}, ?, * or +; empty string if absent).
# The brace form accepts one or more digits/commas, so {2,3} and {10}
# are recognised and not only single-character bodies such as {2}.
rxSegmentQuery = re.compile('^\\[(.*)\\](\\{[0-9,]+\\}|[?*+]|)$')
# One term inside a segment, e.g. text = "abc" /i
# Groups: 1 = identifier (optionally qualifier:identifier),
#         2 = operator (= or !=),
#         3 = value including its quotes,
#         4 = value without the quotes,
#         5 = optional flag including the leading slash.
rxAdvTermQuery = re.compile('^ *([a-zA-Z][a-zA-Z0-9\\-]*(?::[a-zA-Z][a-zA-Z0-9\\-]*)?) *'
                            '(!?=) *(["\'](.*)["\']) *(/[iIcCld])? *$')
# Identifiers that adv_term_query lets through to the corpus-specific code.
acceptableIdentifiers = {'text', 'lemma', 'pos', 'orth', 'norm', 'phonetic'}
def __init__(self):
    """Create a parser instance; nothing is initialised here."""
    pass
...
...
@@ -63,6 +67,7 @@ class QueryParser:
bracketBalance
=
0
curlyBalance
=
0
inQuotes
=
False
inSingleQuotes
=
False
for
i
in
range
(
start
,
end
):
if
inQuotes
:
if
strQuery
[
i
]
==
'"'
and
i
>
0
and
strQuery
[
i
-
1
]
!=
'
\\
'
:
...
...
@@ -71,6 +76,13 @@ class QueryParser:
if
strQuery
[
i
]
==
'"'
and
i
>
0
and
strQuery
[
i
-
1
]
!=
'
\\
'
:
inQuotes
=
True
continue
if
inSingleQuotes
:
if
strQuery
[
i
]
==
"'"
and
i
>
0
and
strQuery
[
i
-
1
]
!=
'
\\
'
:
inSingleQuotes
=
False
continue
if
strQuery
[
i
]
==
"'"
and
i
>
0
and
strQuery
[
i
-
1
]
!=
'
\\
'
:
inSingleQuotes
=
True
continue
if
strQuery
[
i
]
==
'
(
'
:
parenthBalance
+=
1
elif
strQuery
[
i
]
==
'
)
'
:
...
...
@@ -98,6 +110,47 @@ class QueryParser:
return
iCurChar
-
1
,
'
SEQUENCE
'
return
-
1
,
''
@staticmethod
def find_operator_adv_expression(strQuery):
    """
    Locate the highest |, & or ! operator in a segment expression
    in the advanced search.

    Leading and trailing whitespace is ignored. Operators inside
    quoted values or inside parentheses are skipped.

    Return a tuple (position, operator):
    - (index of '!', '!') if the first non-whitespace character is '!';
    - (index, '|') or (index, '&') for the first such operator found
      outside quotes and at parenthesis depth 0;
    - (-1, '') if there is no such operator, including when the string
      is empty or consists of whitespace only.
    """
    whitespace = ' \t\n'
    start = 0
    end = len(strQuery)
    while start < end and strQuery[start] in whitespace:
        start += 1
    while end > start and strQuery[end - 1] in whitespace:
        end -= 1
    if start >= end:
        # Nothing but whitespace: indexing strQuery[start] would fail.
        return -1, ''
    if strQuery[start] == '!':
        return start, '!'

    parenthBalance = 0
    openQuote = ''      # the quote character we are currently inside, if any
    for i in range(start, end):
        ch = strQuery[i]
        # A quote preceded by a backslash is escaped and is not a delimiter.
        # The very first character cannot be escaped.
        if ch in '"\'' and (i == 0 or strQuery[i - 1] != '\\'):
            if not openQuote:
                openQuote = ch
                continue
            if ch == openQuote:
                openQuote = ''
                continue
            # Otherwise it is the other kind of quote inside a quoted
            # value and is treated as ordinary content.
        if openQuote:
            continue
        if ch == '(':
            parenthBalance += 1
        elif ch == ')':
            parenthBalance -= 1
        elif parenthBalance == 0 and ch in '|&':
            return i, ch
    return -1, ''
@staticmethod
def
shift_term_indexes
(
getParams
,
shift
):
"""
...
...
@@ -146,6 +199,35 @@ class QueryParser:
# Abstract function
raise
NotImplementedError
()
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config):
    """
    Corpus-specific handling of one term of an advanced query.

    Called by adv_term_query after the term has been split into its
    parts, the quotes have been removed from the value and the
    identifier has been checked against acceptableIdentifiers.
    This base implementation always raises NotImplementedError;
    subclasses are expected to override it.
    """
    # Abstract function
    raise NotImplementedError()
def adv_term_query(self, query, config: ResourceConfig):
    """
    Parse one term of an advanced query (e.g. text = "abc" /i) and
    pass its parts to adv_term_query_proper().

    The term is matched against rxAdvTermQuery. The value is passed on
    without its quotes, the flag without its leading slash (empty
    string if there is none). The identifiers 'token' and 'word' are
    treated as 'text'.

    Raise Diagnostic (SRU diagnostic 10) if the term does not match
    the pattern, if the opening and closing quotes differ, or if the
    identifier is not in acceptableIdentifiers.
    Return whatever adv_term_query_proper() returns.
    """
    m = self.rxAdvTermQuery.search(query)
    if m is None:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    identifier, op, value = m.group(1), m.group(2), m.group(3)
    # Group 4 is the value without quotes; the optional flag is group 5.
    flags = m.group(5)
    if value[0] != value[-1]:
        raise Diagnostic(DiagnosticTypes.sru, 10)   # Different quotes
    value = value[1:-1]                             # Remove quotes
    # Drop the leading slash so that '/i' is passed on as 'i'.
    flags = flags[1:] if flags else ''
    if identifier in ('token', 'word'):
        identifier = 'text'     # Should I do this?
    if identifier not in self.acceptableIdentifiers:
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message=identifier + ' is not an acceptable identifier in a segment query.')
    return self.adv_term_query_proper(identifier, op, value, flags, config)
def adv_binary_bool(self, strOp, operandL, operandR, config):
    """
    Corpus-specific combination of two operands of a binary boolean
    operator in an advanced query.

    adv_expression_query calls it with strOp equal to '&' or '|' and
    with the results of parsing the left and right sub-expressions.
    This base implementation always raises NotImplementedError.
    """
    # Abstract function
    raise NotImplementedError()
def adv_not_bool(self, operand, config):
    """
    Corpus-specific negation of an operand in an advanced query.

    This base implementation always raises NotImplementedError.
    NOTE(review): presumably the hook for the '!' operator of the
    advanced search -- confirm against the callers.
    """
    # Abstract function
    raise NotImplementedError()
def
translate_simple
(
self
,
query
:
str
,
config
:
ResourceConfig
,
start
=
0
,
end
=-
1
):
"""
Translate a simple search (CQL) query into a corpus-specific query
...
...
@@ -199,6 +281,52 @@ class QueryParser:
return
self
.
not_bool
(
resultRight
,
config
)
return
{}
def adv_expression_query(self, query: str, config: ResourceConfig):
    """
    Recursively translate the expression of one segment of an advanced
    query, i.e. the part between the square brackets.

    The expression is split at the operator reported by
    find_operator_adv_expression():
    - no operator: an expression wrapped in parentheses is unwrapped
      and parsed again, anything else is handed to adv_term_query();
    - '&' or '|': both sides are parsed and combined with
      adv_binary_bool();
    - '!': the rest of the expression is parsed and negated with
      adv_not_bool().

    Raise Diagnostic (SRU diagnostic 10) if the expression is empty or
    if an operand of a binary operator yields an empty result.
    """
    # Operands cut out at an operator keep the surrounding whitespace,
    # which would otherwise hide enclosing parentheses.
    query = query.strip()
    if not query:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    iOpPos, strOp = self.find_operator_adv_expression(query)
    if iOpPos == -1:
        if query[0] == '(' and query[-1] == ')':
            return self.adv_expression_query(query[1:-1], config)
        return self.adv_term_query(query, config)
    if strOp in ('&', '|'):
        resultLeft = self.adv_expression_query(query[:iOpPos], config)
        resultRight = self.adv_expression_query(query[iOpPos + 1:], config)
        if len(resultLeft) <= 0 or len(resultRight) <= 0:
            raise Diagnostic(DiagnosticTypes.sru, 10)
        return self.adv_binary_bool(strOp, resultLeft, resultRight, config)
    elif strOp == '!':
        resultRight = self.adv_expression_query(query[iOpPos + 1:], config)
        # Use the advanced-search hook, like adv_binary_bool above,
        # rather than the simple-search not_bool().
        return self.adv_not_bool(resultRight, config)
def adv_segment_query(self, query: str, config: ResourceConfig):
    """
    Translate one bracketed segment of an advanced query.

    The segment must match rxSegmentQuery; otherwise a Diagnostic
    (SRU diagnostic 27) is raised. The expression between the brackets
    is stripped of surrounding whitespace and translated by
    adv_expression_query(), whose result is returned. The quantifier
    is extracted but not used yet.
    """
    segment = self.rxSegmentQuery.search(query)
    if segment is not None:
        inner, _quantifier = segment.group(1, 2)
        # TODO: quantifier
        return self.adv_expression_query(inner.strip(), config)
    raise Diagnostic(DiagnosticTypes.sru, 27)
def adv_simple_query(self, query: str, config: ResourceConfig, start=0, end=-1):
    """
    Translate the part of an advanced query delimited by start and end
    that contains a single unit: a parenthesised sub-query, a bare
    quoted string or a bracketed segment.

    start is the index of the first character to look at, end is the
    index just past the last one (as in a slice). end=-1 means "up to
    the end of the query". Surrounding whitespace is skipped.

    - "( ... )": the part inside the parentheses is passed to
      adv_main_query();
    - a string in matching double or single quotes whose closing quote
      is not preceded by a backslash: treated as [text=...] and passed
      to adv_segment_query();
    - anything else: passed to adv_segment_query() as is.

    Raise Diagnostic with SRU diagnostic 27 for an empty query and
    SRU diagnostic 10 if start/end do not delimit a non-empty part.
    """
    if len(query) <= 0:
        raise Diagnostic(DiagnosticTypes.sru, 27)
    if end == -1:
        # Default value: without this the range check below would
        # reject every call that does not pass end explicitly.
        end = len(query)
    if start >= len(query) - 1 or end <= 0 or end > len(query):
        raise Diagnostic(DiagnosticTypes.sru, 10)
    while start < end and query[start] in ' \t\n':
        start += 1
    while end > start and query[end - 1] in ' \t\n':
        end -= 1
    if start >= end:
        raise Diagnostic(DiagnosticTypes.sru, 10)

    # end is exclusive, so the last character of the part is query[end - 1].
    first, last = query[start], query[end - 1]
    if first == '(' and last == ')':
        return self.adv_main_query(query, config, start=start + 1, end=end - 1)
    if (first in ('"', "'") and last == first
            and end - start >= 2 and query[end - 2] != '\\'):
        return self.adv_segment_query('[text=' + query[start:end] + ']', config)
    return self.adv_segment_query(query[start:end], config)
def
adv_main_query
(
self
,
query
:
str
,
config
:
ResourceConfig
,
start
=
0
,
end
=-
1
):
if
len
(
query
)
<=
0
:
raise
Diagnostic
(
DiagnosticTypes
.
sru
,
27
)
...
...
This diff is collapsed.
Click to expand it.
common/tsakorpus_query_parser.py
+
23
−
0
View file @
5ff4a239
...
...
@@ -84,6 +84,29 @@ class TsakorpusQueryParser(QueryParser):
# TODO: implement
raise
NotImplementedError
()
def adv_term_query_proper(self, identifier: str, op: str, value: str, flags: str, config: ResourceConfig):
    """
    Return list of query parameters for one term in an advanced query.

    Each parameter is a list [field, 0, value], where field is 'wf'
    for the identifier 'text', 'lex' for 'lemma' and 'gr' for 'pos'.
    For 'pos', the value is replaced using config.pos_convert_reverse
    if it is listed there and used unchanged otherwise.

    Raise Diagnostic (SRU diagnostic 10) if the value is empty, if
    flags is anything other than '', 'i' or 'c', or if the identifier
    is not one of the three above.
    """
    # NOTE(review): op is not consulted here, so a term with '!=' is
    # currently translated exactly like one with '='.
    if not value:
        raise Diagnostic(DiagnosticTypes.sru, 10)
    if flags not in ('', 'i', 'c'):
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message='Tsakorpus does not support regex flags.')
    getParams = []
    if identifier == 'text':
        getParams.append(['wf', 0, value])
    elif identifier == 'lemma':
        getParams.append(['lex', 0, value])
    elif identifier == 'pos':
        if value in config.pos_convert_reverse:
            # UD to corpus-specific POS tags
            value = config.pos_convert_reverse[value]
        getParams.append(['gr', 0, value])
    else:
        raise Diagnostic(DiagnosticTypes.sru, 10,
                         message='The identifier ' + identifier
                                 + ' is not supported in Tsakorpus.')
    return getParams
def
send_query
(
self
,
strGetParams
:
str
,
config
:
ResourceConfig
):
"""
Send the translated query to the Tsakorpus instance. Return JSON results
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment