Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
corpus-services
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lange, Dr. Herbert
corpus-services
Commits
cc1fc085
Commit
cc1fc085
authored
Apr 16, 2021
by
Aleksandr Riaposov
Browse files
Options
Downloads
Patches
Plain Diff
Spellchecker updated to produce better html output
parent
d547bd40
No related branches found
No related tags found
Loading
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/main/java/de/uni_hamburg/corpora/validation/LanguageToolChecker.java
+58
-62
58 additions, 62 deletions
...e/uni_hamburg/corpora/validation/LanguageToolChecker.java
with
58 additions
and
62 deletions
src/main/java/de/uni_hamburg/corpora/validation/LanguageToolChecker.java
+
58
−
62
View file @
cc1fc085
...
...
@@ -25,15 +25,13 @@ import javax.xml.parsers.ParserConfigurationException;
import
javax.xml.transform.TransformerException
;
import
javax.xml.xpath.XPathExpressionException
;
import
org.xml.sax.SAXException
;
import
org.w3c.dom.Document
;
import
org.w3c.dom.Element
;
import
org.w3c.dom.Node
;
import
org.w3c.dom.NodeList
;
import
org.w3c.dom.Text
;
import
org.exmaralda.partitureditor.jexmaralda.BasicTranscription
;
import
org.exmaralda.partitureditor.jexmaralda.JexmaraldaException
;
import
org.jdom.JDOMException
;
import
org.jdom.Document
;
import
org.jdom.Element
;
import
org.jdom.xpath.XPath
;
import
org.languagetool.rules.RuleMatch
;
import
org.languagetool.JLanguageTool
;
...
...
@@ -70,7 +68,7 @@ public class LanguageToolChecker extends Checker implements CorpusFunction {
*/
@Override
public
Report
function
(
CorpusData
cd
,
Boolean
fix
)
throws
SAXException
,
IOException
,
ParserConfigurationException
,
JexmaraldaException
{
throws
SAXException
,
IOException
,
ParserConfigurationException
,
JexmaraldaException
,
JDOMException
,
XPathExpressionException
,
TransformerException
{
Report
stats
=
new
Report
();
btd
=
new
BasicTranscriptionData
(
cd
.
getURL
());
if
(
language
.
equals
(
"de"
))
{
...
...
@@ -92,64 +90,62 @@ public class LanguageToolChecker extends Checker implements CorpusFunction {
+
language
);
return
stats
;
}
Document
doc
=
TypeConverter
.
JdomDocument2W3cDocument
(
btd
.
getJdom
())
;
NodeList
tiers
=
doc
.
getElementsByTagName
(
"tier"
);
boolean
spellingError
=
false
;
Document
jDoc
=
TypeConverter
.
String2JdomDocument
(
cd
.
toSaveableString
()
);
List
<
RuleMatch
>
matches
=
new
ArrayList
<
RuleMatch
>();
int
count
=
0
;
for
(
int
k
=
0
;
k
<
tiers
.
getLength
();
k
++)
{
Element
tier
=
(
Element
)
tiers
.
item
(
k
);
if
(!
tier
.
getAttribute
(
"category"
).
equals
(
tierToCheck
))
{
String
xpathTier
=
"//tier[@category='"
+
tierToCheck
+
"']"
;
XPath
xTier
=
XPath
.
newInstance
(
xpathTier
);
List
tierList
=
xTier
.
selectNodes
(
jDoc
);
//extra for loop to get the tier id value for exmaError
for
(
int
i
=
0
;
i
<
tierList
.
size
();
i
++)
{
Object
oTier
=
tierList
.
get
(
i
);
if
(
oTier
instanceof
Element
)
{
Element
tier
=
(
Element
)
oTier
;
String
tierId
=
tier
.
getAttributeValue
(
"id"
);
String
xpathEvent
=
"//tier[@id='"
+
tierId
+
"']/event"
;
XPath
xEvent
=
XPath
.
newInstance
(
xpathEvent
);
List
eventList
=
xEvent
.
selectNodes
(
tier
);
for
(
int
j
=
0
;
j
<
eventList
.
size
();
j
++)
{
Object
o
=
eventList
.
get
(
j
);
if
(
o
instanceof
Element
)
{
Element
e
=
(
Element
)
o
;
String
eventText
=
e
.
getText
();
String
start
=
e
.
getAttributeValue
(
"start"
);
matches
=
langTool
.
check
(
eventText
);
String
xpathStart
=
"//tier[@category='ref']/event[@start='"
+
start
+
"']"
;
XPath
xpathRef
=
XPath
.
newInstance
(
xpathStart
);
List
refList
=
xpathRef
.
selectNodes
(
jDoc
);
if
(
refList
.
isEmpty
())
{
String
emptyMessage
=
"Ref tier information seems to be missing for event '"
+
eventText
+
"'"
;
stats
.
addCritical
(
function
,
cd
,
emptyMessage
);
exmaError
.
addError
(
function
,
cd
.
getURL
().
getFile
(),
tierId
,
start
,
false
,
emptyMessage
);
continue
;
}
NodeList
events
=
tier
.
getElementsByTagName
(
"event"
);
for
(
int
i
=
0
;
i
<
events
.
getLength
();
i
++)
{
Element
event
=
(
Element
)
events
.
item
(
i
);
NodeList
eventTexts
=
event
.
getChildNodes
();
for
(
int
j
=
0
;
j
<
eventTexts
.
getLength
();
j
++)
{
Node
maybeText
=
eventTexts
.
item
(
j
);
if
(
maybeText
.
getNodeType
()
!=
Node
.
TEXT_NODE
)
{
if
(
maybeText
.
getNodeType
()
==
Node
.
ELEMENT_NODE
&&
maybeText
.
getNodeName
().
equals
(
"ud-information"
))
{
// XXX: ud-information is weird I'll just skip it...
continue
;
}
System
.
out
.
println
(
"This is not a text node: "
+
maybeText
);
continue
;
}
Text
eventText
=
(
Text
)
maybeText
;
String
text
=
eventText
.
getWholeText
();
matches
=
langTool
.
check
(
text
);
Object
refObj
=
refList
.
get
(
0
);
if
(
refObj
instanceof
Element
)
{
Element
refEl
=
(
Element
)
refObj
;
String
refText
=
refEl
.
getText
();
for
(
RuleMatch
match
:
matches
)
{
String
message
=
"Potential error at characters "
+
match
.
getFromPos
()
+
"-"
+
match
.
getToPos
()
+
": "
+
match
.
getMessage
()
+
": \""
+
t
ext
.
substring
(
match
.
getFromPos
(),
+
eventT
ext
.
substring
(
match
.
getFromPos
(),
match
.
getToPos
())
+
"\" "
+
"Suggested correction(s): "
+
match
.
getSuggestedReplacements
();
stats
.
addWarning
(
function
,
cd
,
message
);
// System.out.println("Potential error at characters " +
// match.getFromPos() + "-" + match.getToPos() + ": " +
// match.getMessage() + ": \"" +
// text.substring(match.getFromPos(),
// match.getToPos()) + "\" " +
// "Suggested correction(s): " +
// match.getSuggestedReplacements());
//add ExmaError tierID eventID
exmaError
.
addError
(
function
,
cd
.
getURL
().
getFile
(),
tier
.
getAttribute
(
"id"
),
event
.
getAttribute
(
"start"
),
false
,
message
);
}
if
(!
matches
.
isEmpty
())
{
count
++;
+
match
.
getSuggestedReplacements
()
+
". Reference tier id: "
+
refText
;
spellingError
=
true
;
stats
.
addWarning
(
function
,
cd
,
message
);
exmaError
.
addError
(
function
,
cd
.
getURL
().
getFile
(),
tierId
,
start
,
false
,
message
);
}
}
}
}
if
(
count
==
0
)
{
if
(!
spellingError
)
{
stats
.
addCorrect
(
function
,
cd
,
"No spelling errors found."
);
}
}
}
return
stats
;
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment