Skip to content
Snippets Groups Projects
Commit cb2cedb0 authored by Lange, Dr. Herbert's avatar Lange, Dr. Herbert
Browse files

initial commit

parents
Branches
No related tags found
No related merge requests found
pom.xml 0 → 100644
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Xml2Paths command-line tool. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>de.uni-hamburg.corpora</groupId>
<artifactId>Xml2Paths</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- JDOM 1.x: XML parsing (SAXBuilder, Document, Element, Attribute). -->
<dependency>
<groupId>jdom</groupId>
<artifactId>jdom</artifactId>
<version>1.1</version>
</dependency>
<!-- Commons IO: used for FilenameUtils.getExtension when filtering files by extension. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
</dependencies>
<properties>
<!-- Source and target level are Java 11; the same values are repeated in the compiler plugin below. -->
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.2</version>
<configuration>
<source>11</source>
<target>11</target>
</configuration>
</plugin>
<!-- exec:java runs the tool's main class directly from the build. -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.2.1</version>
<executions>
<execution>
<goals>
<goal>java</goal>
</goals>
</execution>
</executions>
<configuration>
<mainClass>de.uni_hamburg.corpora.Xml2Paths</mainClass>
</configuration>
</plugin>
<!-- Assembly configuration: jar-with-dependencies whose manifest names the main class;
     appendAssemblyId=false keeps the assembly id out of the artifact file name. -->
<!-- NOTE(review): no <version> and no <executions> are declared for this plugin, so it is
     presumably invoked explicitly (e.g. assembly:single) - confirm and consider pinning a version. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>de.uni_hamburg.corpora.Xml2Paths</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<appendAssemblyId>false</appendAssemblyId>
</configuration>
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
package de.uni_hamburg.corpora;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.FileSystem;
import org.apache.commons.io.FilenameUtils;
import org.jdom.*;
import org.jdom.input.SAXBuilder;
/**
 * Command-line tool that lists the element and attribute paths occurring in XML files.
 *
 * <p>The first argument is either a single file or a directory. A directory is walked
 * recursively and every regular file whose extension is in the accepted list (default
 * {@code xml}) is parsed. Each element is reported as a slash-separated path starting at the
 * document root; each attribute is reported as its element's path followed by {@code @name}.
 * With {@code -count}, the distinct paths are printed together with their number of
 * occurrences instead of one line per occurrence.
 *
 * @author bba1792, Dr. Herbert Lange
 * @version 20210831
 */
public class Xml2Paths {

    /** Logger for progress and parse-failure messages, named after this class. */
    private static final Logger logger = Logger.getLogger(Xml2Paths.class.getName());

    public Xml2Paths() {
    }

    // Actually runs out of memory
    // NOTE: the loop below reads frontier.getFirst() but never removes it, so the frontier
    // only ever grows; kept for reference only.
    // public static List<List<String>> getPaths(Element e) {
    // LinkedList<Pair<List<String>,Element>> frontier = new LinkedList<>(Collections.singletonList(new Pair<>(Collections.singletonList(""),e)));
    // List<List<String>> paths = new ArrayList<>();
    // while (!frontier.isEmpty()) {
    // Pair<List<String>,Element> current = frontier.getFirst();
    // LinkedList<String> currentPath = new LinkedList<>(current.first());
    // currentPath.addLast(current.second().getName());
    // for (Element c : (List<Element>) current.second().getChildren()) {
    // frontier.add(new Pair<>(currentPath,c));
    // }
    // paths.add(currentPath);
    // }
    // return paths;
    // }

    /**
     * Collects the paths of an element, its attributes and all of its descendants.
     *
     * <p>The result contains, in order: the path of {@code e} itself, one path per attribute of
     * {@code e} (attribute name prefixed with {@code @}), then the results for every child
     * element in document order.
     *
     * @param prefix names of the ancestors of {@code e}, outermost first; not modified
     * @param e the element to start from
     * @return one list of path segments per element and attribute found
     */
    @SuppressWarnings("unchecked") // the casts below narrow the untyped lists handed out by JDOM
    public static List<List<String>> descend(List<String> prefix, Element e) {
        List<List<String>> result = new ArrayList<>();
        List<String> nPrefix = new ArrayList<>(prefix);
        nPrefix.add(e.getName());
        result.add(nPrefix);
        for (Attribute a : (List<Attribute>) e.getAttributes()) {
            List<String> attributePath = new ArrayList<>(nPrefix);
            attributePath.add("@" + a.getName());
            result.add(attributePath);
        }
        for (Element child : (List<Element>) e.getChildren()) {
            result.addAll(descend(nPrefix, child));
        }
        return result;
    }

    /**
     * Parses one XML document and returns all paths found in it.
     *
     * <p>Parsing is best effort: a document that cannot be read or parsed is logged and
     * contributes no paths, so that one broken file does not abort a directory run.
     *
     * @param file location of the XML document
     * @return the paths of the document, or an empty list if it could not be parsed
     */
    private static List<List<String>> file2paths(URL file) {
        try {
            // NOTE(review): the builder runs with its default settings; if inputs can be
            // untrusted, consider restricting DTD/external-entity processing.
            SAXBuilder builder = new SAXBuilder();
            Document d = builder.build(file);
            return descend(Collections.emptyList(), d.getRootElement());
        } catch (JDOMException | IOException e) {
            logger.log(Level.WARNING, "Could not parse " + file, e);
            return Collections.emptyList();
        }
    }

    /**
     * Gathers the paths of a single file, or of every matching file below a directory.
     *
     * @param argFile an existing regular file or directory
     * @param extensions accepted file extensions (without dot); only applied to directories
     * @return all paths found, in the order the files were read
     * @throws IOException if the directory cannot be walked or a file URL cannot be built
     */
    private static List<List<String>> collectPaths(File argFile, List<String> extensions) throws IOException {
        if (argFile.isFile()) {
            return file2paths(argFile.toURI().toURL());
        }
        List<Path> allFiles;
        // Files.walk keeps directories open until the stream is closed.
        try (Stream<Path> walk = Files.walk(Paths.get(argFile.toURI()))) {
            allFiles = walk
                    .filter(f -> Files.isRegularFile(f)
                            && extensions.contains(FilenameUtils.getExtension(f.toString())))
                    .collect(Collectors.toList());
        }
        List<List<String>> paths = new ArrayList<>();
        for (Path p : allFiles) {
            logger.info("Reading " + p);
            paths.addAll(file2paths(p.toFile().toURI().toURL()));
        }
        return paths;
    }

    /**
     * Writes the collected paths to standard output.
     *
     * @param paths the path segment lists to print
     * @param count if {@code false}, print every path occurrence on its own line; if
     *     {@code true}, print the number of distinct paths (twice, as before) followed by
     *     {@code occurrences<TAB>path} lines in ascending order of occurrences
     */
    private static void printPaths(List<List<String>> paths, boolean count) {
        Map<String, Integer> pathCounts = new HashMap<>();
        for (List<String> p : paths) {
            String path = "/" + String.join("/", p);
            if (count) {
                pathCounts.merge(path, 1, Integer::sum);
            } else {
                System.out.println(path);
            }
        }
        if (count) {
            List<Map.Entry<String, Integer>> list = new ArrayList<>(pathCounts.entrySet());
            System.out.println(pathCounts.size());
            // Ties are broken by path so the output does not depend on hash iteration order.
            list.sort(Map.Entry.<String, Integer>comparingByValue()
                    .thenComparing(Map.Entry.comparingByKey()));
            System.out.println(list.size());
            list.forEach(e -> System.out.println(e.getValue() + "\t" + e.getKey()));
        }
    }

    /**
     * Entry point.
     *
     * @param args {@code file-name|file-path [file-extension[,file-extension]+] [-count]};
     *     without arguments a usage line is printed. If the first argument is neither a file
     *     nor a directory, an error is printed and the JVM exits with status -1.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            Properties p = System.getProperties();
            System.out.println("Usage: " + p.get("sun.java.command") + " file-name|file-path [file-extension[,file-extension]+] [-count]");
            return;
        }
        // Default extension is XML
        List<String> extensions = Collections.singletonList("xml");
        // Can be overridden by command line argument
        if (args.length >= 2 && !args[1].contains("-count")) {
            extensions = Arrays.asList(args[1].split(","));
        }
        System.out.println("Extensions: " + extensions);
        boolean count = (args.length == 2 && args[1].contains("-count"))
                || (args.length == 3 && args[2].contains("-count"));

        File argFile = Path.of(args[0]).toFile();
        if (!argFile.isFile() && !argFile.isDirectory()) {
            System.err.println("File or path not found: " + args[0]);
            System.exit(-1);
        }
        try {
            List<List<String>> paths = collectPaths(argFile, extensions);
            logger.info("Creating output");
            printPaths(paths, count);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Could not read " + args[0], e);
        }
    }

    /**
     * Minimal immutable two-element tuple. Currently only referenced by the commented-out
     * iterative traversal above.
     *
     * @param <T> type of the first component
     * @param <T1> type of the second component
     */
    private static class Pair<T, T1> {
        private final T v;
        private final T1 v1;

        Pair(T v, T1 v1) {
            this.v = v;
            this.v1 = v1;
        }

        T first() {
            return v;
        }

        T1 second() {
            return v1;
        }
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment