Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
X
xml2paths
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lange, Dr. Herbert
xml2paths
Commits
cb2cedb0
Commit
cb2cedb0
authored
Sep 28, 2021
by
Lange, Dr. Herbert
Browse files
Options
Downloads
Patches
Plain Diff
initial commit
parents
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
pom.xml
+72
-0
72 additions, 0 deletions
pom.xml
src/main/java/de/uni_hamburg/corpora/Xml2Paths.java
+146
-0
146 additions, 0 deletions
src/main/java/de/uni_hamburg/corpora/Xml2Paths.java
with
218 additions
and
0 deletions
pom.xml
0 → 100644
+
72
−
0
View file @
cb2cedb0
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build description for the Xml2Paths command line tool. -->
<project
xmlns=
"http://maven.apache.org/POM/4.0.0"
xmlns:xsi=
"http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation=
"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
>
<modelVersion>
4.0.0
</modelVersion>
<!-- Project coordinates. -->
<groupId>
de.uni-hamburg.corpora
</groupId>
<artifactId>
Xml2Paths
</artifactId>
<version>
1.0-SNAPSHOT
</version>
<!-- Libraries used by the Java source: JDOM (SAXBuilder, Element, Attribute) and Commons IO (FilenameUtils). -->
<dependencies>
<dependency>
<groupId>
jdom
</groupId>
<artifactId>
jdom
</artifactId>
<version>
1.1
</version>
</dependency>
<dependency>
<groupId>
commons-io
</groupId>
<artifactId>
commons-io
</artifactId>
<version>
2.11.0
</version>
</dependency>
</dependencies>
<!-- Java 11 language level; the same values are repeated in the compiler plugin configuration below. -->
<properties>
<maven.compiler.source>
11
</maven.compiler.source>
<maven.compiler.target>
11
</maven.compiler.target>
</properties>
<build>
<plugins>
<!-- Compiler plugin pinned to a fixed version, compiling for Java 11. -->
<plugin>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-compiler-plugin
</artifactId>
<version>
3.6.2
</version>
<configuration>
<source>
11
</source>
<target>
11
</target>
</configuration>
</plugin>
<!-- Exec plugin: its "java" goal is configured to start the tool's main class. -->
<plugin>
<groupId>
org.codehaus.mojo
</groupId>
<artifactId>
exec-maven-plugin
</artifactId>
<version>
1.2.1
</version>
<executions>
<execution>
<goals>
<goal>
java
</goal>
</goals>
</execution>
</executions>
<configuration>
<mainClass>
de.uni_hamburg.corpora.Xml2Paths
</mainClass>
</configuration>
</plugin>
<!-- Assembly plugin: describes a jar-with-dependencies archive whose manifest names the main class;
     appendAssemblyId=false keeps the assembly id out of the archive name.
     NOTE(review): no version and no execution are declared here, so the goal presumably has to be
     invoked explicitly on the command line - confirm and consider pinning a version. -->
<plugin>
<artifactId>
maven-assembly-plugin
</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>
de.uni_hamburg.corpora.Xml2Paths
</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>
jar-with-dependencies
</descriptorRef>
</descriptorRefs>
<appendAssemblyId>
false
</appendAssemblyId>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
This diff is collapsed.
Click to expand it.
src/main/java/de/uni_hamburg/corpora/Xml2Paths.java
0 → 100644
+
146
−
0
View file @
cb2cedb0
package
de.uni_hamburg.corpora
;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.io.FileSystem;
import org.apache.commons.io.FilenameUtils;
import org.jdom.*;
import org.jdom.input.SAXBuilder;
/**
 * Command line tool that lists the element and attribute paths occurring in XML files.
 *
 * <p>A path is the slash-separated chain of element names from the document root down to
 * an element; an attribute is reported as an additional final step of the form
 * {@code @name}. The first argument names either a single file or a directory; a directory
 * is walked recursively and every regular file with one of the requested extensions
 * (default {@code xml}) is parsed. With {@code -count} the distinct paths are printed
 * together with how often each occurred instead of one line per occurrence.
 *
 * @author bba1792, Dr. Herbert Lange
 * @version 20210831
 */
public class Xml2Paths {

    // Class#getName() yields the plain class name; Class#toString() would prefix it with "class ".
    private static final Logger logger = Logger.getLogger(Xml2Paths.class.getName());

    /** Creates an instance; all functionality of this class is reachable through static methods. */
    public Xml2Paths() {
    }

    // An earlier iterative traversal (a frontier list of Pair<path, element>) was disabled
    // because it ran out of memory; the recursive traversal below is used instead.

    /**
     * Collects the paths of an element, its attributes and all of its descendants.
     *
     * <p>The result is in document (pre-order) sequence: the element itself, then one entry
     * per attribute of the element, then the entries of each child element in turn.
     *
     * @param prefix names of the ancestors of {@code e}, outermost first; it is not modified
     * @param e the element to start from
     * @return a new list with one path (list of steps) per element and attribute found
     */
    public static List<List<String>> descend(List<String> prefix, Element e) {
        List<List<String>> result = new ArrayList<>();
        collectPaths(prefix, e, result);
        return result;
    }

    /**
     * Appends the paths below {@code e} to {@code result}; a shared accumulator avoids
     * re-copying the partial results at every recursion level.
     */
    @SuppressWarnings("unchecked") // the JDOM calls used here return raw lists
    private static void collectPaths(List<String> prefix, Element e, List<List<String>> result) {
        List<String> elementPath = new ArrayList<>(prefix);
        elementPath.add(e.getName());
        result.add(elementPath);
        for (Attribute a : (List<Attribute>) e.getAttributes()) {
            List<String> attributePath = new ArrayList<>(elementPath);
            attributePath.add("@" + a.getName());
            result.add(attributePath);
        }
        for (Element child : (List<Element>) e.getChildren()) {
            collectPaths(elementPath, child, result);
        }
    }

    /**
     * Parses one XML document and returns all paths found in it.
     *
     * @param file location of the document
     * @return the paths of the document, or an empty list if it could not be read or parsed
     *     (the failure is logged and processing of other files continues)
     */
    private static List<List<String>> file2paths(URL file) {
        try {
            Document d = new SAXBuilder().build(file);
            return descend(Collections.emptyList(), d.getRootElement());
        } catch (JDOMException | IOException e) {
            logger.log(Level.WARNING, "Could not read " + file, e);
            return Collections.emptyList();
        }
    }

    /**
     * Parses every regular file below {@code directory} whose extension is listed in
     * {@code extensions} and returns the concatenated paths of all of them.
     *
     * @throws IOException if the directory cannot be walked
     */
    private static List<List<String>> directory2paths(File directory, List<String> extensions)
            throws IOException {
        List<Path> allFiles;
        // The stream returned by Files.walk holds open directories until it is closed.
        try (Stream<Path> walk = Files.walk(Paths.get(directory.toURI()))) {
            allFiles = walk
                    .filter(f -> Files.isRegularFile(f)
                            && extensions.contains(FilenameUtils.getExtension(f.toString())))
                    .collect(Collectors.toList());
        }
        List<List<String>> paths = new ArrayList<>();
        for (Path p : allFiles) {
            logger.info("Reading " + p);
            paths.addAll(file2paths(p.toFile().toURI().toURL()));
        }
        return paths;
    }

    /** Tells whether a command line argument is the count switch (any argument containing "-count"). */
    private static boolean isCountFlag(String arg) {
        return arg.contains("-count");
    }

    /**
     * Writes the result to standard output: one line per path occurrence, or, when
     * {@code count} is set, the number of distinct paths (printed twice, before and after
     * sorting) followed by {@code occurrences<TAB>path} lines in ascending order of occurrences.
     */
    private static void printPaths(List<List<String>> paths, boolean count) {
        Map<String, Integer> pathCounts = new HashMap<>();
        for (List<String> p : paths) {
            String path = "/" + String.join("/", p);
            if (count) {
                pathCounts.merge(path, 1, Integer::sum);
            } else {
                System.out.println(path);
            }
        }
        if (count) {
            List<Map.Entry<String, Integer>> list = new ArrayList<>(pathCounts.entrySet());
            System.out.println(pathCounts.size());
            list.sort(Map.Entry.comparingByValue());
            System.out.println(list.size());
            list.forEach(e -> System.out.println(e.getValue() + "\t" + e.getKey()));
        }
    }

    /**
     * Program entry point.
     *
     * @param args {@code file-name|file-path [file-extension[,file-extension]+] [-count]};
     *     without arguments a usage message is printed. If the first argument is neither a
     *     file nor a directory the program exits with status -1.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Usage: " + System.getProperty("sun.java.command")
                    + " file-name|file-path [file-extension[,file-extension]+] [-count]");
            return;
        }

        // Default extension is XML; the second argument overrides it unless it is the count switch.
        List<String> extensions = Collections.singletonList("xml");
        if (args.length >= 2 && !isCountFlag(args[1])) {
            extensions = Arrays.asList(args[1].split(","));
        }
        System.out.println("Extensions: " + extensions);

        // The switch is only recognised as the last of two or three arguments.
        boolean count = (args.length == 2 && isCountFlag(args[1]))
                || (args.length == 3 && isCountFlag(args[2]));

        File argFile = Path.of(args[0]).toFile();
        try {
            List<List<String>> paths;
            if (argFile.isFile()) {
                paths = file2paths(argFile.toURI().toURL());
            } else if (argFile.isDirectory()) {
                paths = directory2paths(argFile, extensions);
            } else {
                System.err.println("File or path not found: " + args[0]);
                System.exit(-1);
                return; // not reached; lets the compiler see that paths is always assigned below
            }
            logger.info("Creating output");
            printPaths(paths, count);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Failed to process " + args[0], e);
        }
    }

    /**
     * Immutable pair of two values. Currently only referenced by the disabled iterative
     * traversal mentioned above.
     */
    private static final class Pair<T, T1> {
        private final T v;
        private final T1 v1;

        Pair(T v, T1 v1) {
            this.v = v;
            this.v1 = v1;
        }

        /** Returns the first component. */
        T first() {
            return v;
        }

        /** Returns the second component. */
        T1 second() {
            return v1;
        }
    }
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment