1、递归处理xml文件的方法
使用 dom4j 解析,需要引入的 Maven 依赖如下:
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
<version>1.1.6</version>
</dependency>
示例代码:
/**
 * Recursively walks a dom4j document and prints its content to standard output:
 * the name of every element, and the trimmed string value of every other node.
 */
public class XmlParse {

    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_XML_PATH =
            "C:\\Users\\40275\\Desktop\\2triples_Astronaut_dev_challenge.xml";

    /**
     * Parses an XML file and prints its node tree.
     *
     * @param args optional; {@code args[0]} is the path of the XML file to read.
     *             When absent, {@link #DEFAULT_XML_PATH} is used.
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        String xmlPath = (args != null && args.length > 0) ? args[0] : DEFAULT_XML_PATH;

        // NOTE(review): the reader is used with its default configuration; if the input
        // can come from an untrusted source, confirm that DTD/external-entity processing
        // is disabled on the underlying parser.
        SAXReader saxReader = new SAXReader();
        Document document = saxReader.read(new File(xmlPath));
        treeWalk(document);
    }

    /**
     * Walks the whole document starting at its root element.
     *
     * @param document the parsed document; a document without a root element prints nothing
     */
    public static void treeWalk(Document document) {
        Element root = document.getRootElement();
        if (root != null) {
            treeWalk(root);
        }
    }

    /**
     * Visits the child nodes of {@code element} in document order.
     * Child elements have their name printed and are then walked recursively;
     * every other child node has its trimmed string value printed.
     *
     * @param element the element whose children are visited
     */
    public static void treeWalk(Element element) {
        for (int i = 0, size = element.nodeCount(); i < size; i++) {
            Node node = element.node(i);
            if (node instanceof Element) {
                System.out.println(node.getName());
                treeWalk((Element) node);
            } else {
                // The value is trimmed before printing, so a whitespace-only node
                // still produces an (empty) output line.
                System.out.println(node.getStringValue().trim());
            }
        }
    }
}
2、JSON文件的处理
json依赖
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20160810</version>
</dependency>
<!--加入对commons-io的依赖-->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
以 JSON 文件中每条记录的唯一字段(如 sentText)作为 key、该记录对应的 JSON 对象作为 value,把文件读入 Map 后再进行比较处理:
/**
 * Loads two JSON files into maps and prints how many keys of the second map
 * also occur in the first.
 *
 * <p>Prints the size of each map, then the size of the key intersection. The file
 * {@code d:\differ.txt} is opened (and therefore created/truncated) as a UTF-8 output
 * target, but nothing is written to it at the moment.
 *
 * @throws IOException if an input file cannot be read or the output file cannot be opened
 */
public static void validAndTrained() throws IOException {
    String trainJsonFilePath = "C:\\Users\\40275\\Desktop\\traindev.json";
    String validOutJsonFilePath = "C:\\Users\\40275\\Desktop\\train_out.json";

    Map<String, JSONObject> traindevMap = new HashMap<String, JSONObject>();
    Map<String, JSONObject> validOutMap = new HashMap<String, JSONObject>();
    mapJsonFileToMap(trainJsonFilePath, traindevMap);
    // NOTE(review): mapJsonFileToMapCos is defined elsewhere; presumably a variant loader
    // for a differently formatted file — confirm against its definition.
    mapJsonFileToMapCos(validOutJsonFilePath, validOutMap);

    System.out.println("traindev "+traindevMap.size());
    System.out.println("valid_out "+validOutMap.size());

    Set<String> sumKeys = new HashSet<String>(traindevMap.keySet());

    BufferedWriter bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(new File("d:\\differ.txt")), "UTF-8"));
    try {
        // Number of keys present in both maps.
        int count = 0;
        for (String devKey : validOutMap.keySet()) {
            if (sumKeys.contains(devKey)) {
                count++;
                // To dump the intersection, write traindevMap.get(devKey).toString()
                // followed by "\t\n" to bufferedWriter here.
            }
            // Keys missing from traindevMap can be written to bufferedWriter in an
            // else branch when debugging the difference.
        }
        System.out.println(count);
    } finally {
        // close() flushes any buffered output and releases the file handle,
        // also when the loop above fails.
        bufferedWriter.close();
    }
}
/**
 * Reads a file that holds one JSON object per line (no separating commas between
 * records) and stores each object in {@code jsonMap} under the value of its
 * {@code "sentText"} field.
 *
 * <p>When several lines share the same {@code "sentText"}, only the first one is kept;
 * a key that is already present in {@code jsonMap} is never overwritten. Blank lines
 * are skipped. The file is decoded as UTF-8.
 *
 * @param filePath path of the JSON-lines file to read
 * @param jsonMap  map that receives the parsed objects; modified in place
 * @throws IOException if the file cannot be opened or read
 */
private static void mapJsonFileToMap(String filePath, Map<String, JSONObject> jsonMap) throws IOException {
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(filePath), "UTF-8"));
    try {
        String oneLine;
        while ((oneLine = reader.readLine()) != null) {
            if (oneLine.trim().isEmpty()) {
                // An empty line is not a JSON object; skip it instead of failing the load.
                continue;
            }
            JSONObject jsonObject = new JSONObject(oneLine);
            String key = (String) jsonObject.get("sentText");
            if (!jsonMap.containsKey(key)) {
                jsonMap.put(key, jsonObject);
            }
        }
    } finally {
        // Closing the reader also closes the underlying file stream,
        // including when a line fails to parse.
        reader.close();
    }
}