第一次写-如何用SAX高效解析超大XML文件

时间:2021-12-27 03:56:51
最近用到xml文件想把它读取之后存到数据库里面,但是在网上找了一些方法,发现不是会出现文件过大导致堆溢出的问题,就是效率太低,几百万条数据需要好几个小时甚至一两天,于是我就捉摸着自己写了一个。是用SAX解析的,存储的数据库为SQLserver2012,最终对于将近3个G包含几百万条数据的xml文件,解析时间大概二三十分钟吧,效率还可以,当然还有待改进,没有用到多线程的方法,毕竟是初学者,很多东西都是自己瞎捉摸的,希望大家多多指点。
package xmlReaderSecond;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import xmlReaderSecond.BeanListHandler;
import xmlReaderSecond.DBconnect;
import xmlReaderSecond.Nodes;
import xmlReaderSecond.Way;
import xmlReaderSecond.XMLreadersecond;

public class XMLreadersecond {
/*
* fuction:用于解析超大Xml文件并存储
* author:hz
* time:2016.08.09.22:00
*/


//。。。。。。。。。。。。。。。。下面为第三种方法,能解决特大文件读取的问题,并且效率很高。。。。。。。。。。。。。//

public void readPolyAndPointXml() throws ParserConfigurationException, SAXException, IOException{
// 创建解析工厂
SAXParserFactory factory = SAXParserFactory.newInstance();
// 创建解析器
SAXParser parser = factory.newSAXParser();
// 得到读取器
XMLReader reader = parser.getXMLReader();
// 设置内容处理器
BeanListHandler handler = new BeanListHandler();
reader.setContentHandler(handler);
//读取xml文档
// reader.parse("C:/Users/Administrator/Desktop/XMLceshi.txt");
reader.parse("C:/Users/Administrator/Desktop/SeattleXmlCeshi.osm");
}
public static void main(String[] args) throws ParserConfigurationException, SAXException, IOException{

long startTime=System.currentTimeMillis();//记录开始时间
Date dateStart =new Date();

XMLreadersecond readersecond=new XMLreadersecond();
readersecond.readPolyAndPointXml();

DBconnect dbmysql=new DBconnect();
Connection connection=dbmysql.connmssql();
try{

System.out.println("入库已完成!正在清理无效数据,请稍后。。。。。。");
String strSql="";
strSql = "delete from poly where id in (select distinct(id) from poly where polyType=3) and polyType=2";//将字段updateData全部设为0,以区分新增数据
PreparedStatement psStatement = connection.prepareStatement(strSql);
psStatement.executeUpdate();

}catch(Exception ex){
ex.printStackTrace();
}
long endTime=System.currentTimeMillis();//记录结束时间
float excTime=(float)(endTime-startTime)/1000;
int hours= (int)(excTime/3600);
int minutes=(int)((excTime%3600)/60);
float seconds=(excTime%3600)%60;
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
System.out.println("完成! 开始于:"+df.format(dateStart)+" 结束于:"+df.format(new Date())+" 耗时:"+ hours+ "时:"+minutes+"分:"+seconds+"秒! ");

System.exit(0);
}
//。。。。。。。。。。。。。。。。以上为第三种方法,可取。。。。。。。。。。。。。//

}
class BeanListHandler extends DefaultHandler {
private String curretntag="";
private String idcontents="";
private String versioncontents="";
private String uidioncontents="";
private String usercontents="";
private String loncontents="";
private String latcontents="";
private String changesetcontents="";
private String timestampcontents="";
private String visiblecontents="";
private String kvcontentsW="";
private String kvcontentsN="";
private String pointids="";
private Nodes nodes;
private Way way;

private List<Nodes> nodeslist;
private List<Way> waylist;

boolean Type=false,temTympe=false;
private Integer countp=0;
private Integer countw=0;
private Integer plagN=0;//用于只提取前两个标签
private Integer plagW=0;//用于只提取前两个标签


public void startDocument() throws SAXException{

nodeslist = new ArrayList<Nodes>();
waylist = new ArrayList<Way>();
//System.out.println("正在读取Xml文档,请倒数2秒钟。。。。。。。。 ");
System.out.println("历史数据正在入库,如果数据量过大需要一段时间,请耐心等待。。。。。。。。 ");

}

public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
if("node".equals(qName)) {
if(attributes.getValue("id")!=null&&attributes.getValue("id")!="")
idcontents=attributes.getValue("id");
else
idcontents="0";
if(attributes.getValue("version")!=null&&attributes.getValue("version")!="")
versioncontents=attributes.getValue("version");
else
versioncontents="0";
if(attributes.getValue("uid")!=null&&attributes.getValue("uid")!="")
uidioncontents=attributes.getValue("uid");
else
uidioncontents="0";
if(attributes.getValue("user")!=null&&attributes.getValue("user")!="")
usercontents=attributes.getValue("user");
else
usercontents="0";
if(attributes.getValue("lon")!=null&&attributes.getValue("lon")!="")
loncontents=attributes.getValue("lon");
else
loncontents="0";
if(attributes.getValue("lat")!=null&&attributes.getValue("lat")!="")
latcontents=attributes.getValue("lat");
else
latcontents="0";
if(attributes.getValue("changeset")!=null&&attributes.getValue("changeset")!="")
changesetcontents=attributes.getValue("changeset");
else
changesetcontents="0";
if(attributes.getValue("timestamp")!=null&&attributes.getValue("timestamp")!="")
timestampcontents=attributes.getValue("timestamp");
else
timestampcontents="0";
// if(attributes.getValue("visible")!=null&&attributes.getValue("visible")!="") //不再存储visible字段
// visiblecontents=attributes.getValue("visible");
// else
// visiblecontents="0";
curretntag="node";
countp++;

}
if("node".equals(curretntag)&& "tag".equals(qName)&&plagN<4) {
String kcontents="";
String vcontents="";
kcontents=attributes.getValue("k");
vcontents=attributes.getValue("v");
kvcontentsN+=kcontents+"="+vcontents+";";
plagN++;//用于只提取前4个标签
}

if("way".equals(qName)) {
//对way操作
if(attributes.getValue("id")!=null&&attributes.getValue("id")!="")
idcontents=attributes.getValue("id");
else
idcontents="0";
if(attributes.getValue("version")!=null&&attributes.getValue("version")!="")
versioncontents=attributes.getValue("version");
else
versioncontents="0";
if(attributes.getValue("uid")!=null&&attributes.getValue("uid")!="")
uidioncontents=attributes.getValue("uid");
else
uidioncontents="0";
if(attributes.getValue("user")!=null&&attributes.getValue("user")!="")
usercontents=attributes.getValue("user");
else
usercontents="0";
if(attributes.getValue("changeset")!=null&&attributes.getValue("changeset")!="")
changesetcontents=attributes.getValue("changeset");
else
changesetcontents="0";
if(attributes.getValue("timestamp")!=null&&attributes.getValue("timestamp")!="")
timestampcontents=attributes.getValue("timestamp");
else
timestampcontents="0";
// if(attributes.getValue("visible")!=null&&attributes.getValue("visible")!="") //不再存储visible字段
// visiblecontents=attributes.getValue("visible");
// else
// visiblecontents="0";
curretntag="way";
countw++;
}

if("way".equals(curretntag)&&"tag".equals(qName)&&plagW<2) {
String kcontents="";
String vcontents="";
kcontents=attributes.getValue("k");
vcontents=attributes.getValue("v");
kvcontentsW+=kcontents+"="+vcontents+";";
plagW++;//用于只提取前两个标签
}
//对nd操作
if("way".equals(curretntag)&&"nd".equals(qName)) {
String ref="";
ref=attributes.getValue("ref");
pointids+=ref+";";
}
}

@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
//对node进行处理
if("node".equals(qName)){

nodes=new Nodes();
nodes.setId(idcontents);
nodes.setVersion(versioncontents);
nodes.setUid(uidioncontents);
nodes.setUser(usercontents);
nodes.setLon(loncontents);
nodes.setLat(latcontents);
nodes.setChangeset(changesetcontents);
nodes.setTimestamp(timestampcontents);
nodes.setVisible(visiblecontents);
nodes.setTag(kvcontentsN);

nodeslist.add(nodes);
//对要存满的waylist进行处理
if(nodeslist.size()>=100000)
{
DBconnect dbmysql=new DBconnect();
Connection connection=dbmysql.connmssql();
for(int i=0;i<nodeslist.size();i++){
try {
String strSql="";
strSql= "Insert into point(id,version,lon,lat,userid,username,changeset,timestamp,visible,tag,updateData) Values(?,?,?,?,?,?,?,?,?,?,0)";
//System.out.println("Point Id:"+nodeslist.get(i).getId()+" version: "+nodeslist.get(i).getVersion()+" timestamp:"+nodeslist.get(i).getTimestamp()+" tag:"+ nodeslist.get(i).getTag());
PreparedStatement ps = connection.prepareStatement(strSql);
ps.setLong(1, Long.parseLong(nodeslist.get(i).getId()));
ps.setInt(2, Integer.parseInt(nodeslist.get(i).getVersion()));
ps.setString(3,nodeslist.get(i).getLon());
ps.setString(4,nodeslist.get(i).getLat());
ps.setLong(5, Long.parseLong(nodeslist.get(i).getUid()));
ps.setString(6, nodeslist.get(i).getUser());
ps.setString(7, nodeslist.get(i).getChangeset());
ps.setTimestamp(8, Timestamp.valueOf(nodeslist.get(i).getTimestamp().replace("T", " ").replace("Z", "")));
ps.setString(9, nodeslist.get(i).getVisible());
ps.setString(10, nodeslist.get(i).getTag());
ps.executeUpdate();
} catch (SQLException e) {
e.printStackTrace();
}

}
nodeslist.clear();
}
kvcontentsN="";
curretntag="";
plagN=0;
nodes=null;
}

//对way进行处理
if("way".equals(qName)){

//对前面没有处理完的nodeslist集合进行处理
if(!nodeslist.isEmpty())
{
DBconnect dbmysql=new DBconnect();
Connection connection=dbmysql.connmssql();
for(int i=0;i<nodeslist.size();i++){
try {
String strSql="";
strSql= "Insert into point(id,version,lon,lat,userid,username,changeset,timestamp,visible,tag,updateData) Values(?,?,?,?,?,?,?,?,?,?,0)";
// System.out.println("Point Id:"+nodeslist.get(i).getId()+" version: "+nodeslist.get(i).getVersion()+" timestamp:"+nodeslist.get(i).getTimestamp()+" tag:"+nodeslist.get(i).getTag());
PreparedStatement ps = connection.prepareStatement(strSql);
ps.setLong(1, Long.parseLong(nodeslist.get(i).getId()));
ps.setInt(2, Integer.parseInt(nodeslist.get(i).getVersion()));
ps.setString(3,nodeslist.get(i).getLon());
ps.setString(4,nodeslist.get(i).getLat());
ps.setLong(5, Long.parseLong(nodeslist.get(i).getUid()));
ps.setString(6, nodeslist.get(i).getUser());
ps.setString(7, nodeslist.get(i).getChangeset());
ps.setTimestamp(8, Timestamp.valueOf(nodeslist.get(i).getTimestamp().replace("T", " ").replace("Z", "")));
ps.setString(9, nodeslist.get(i).getVisible());
ps.setString(10, nodeslist.get(i).getTag());
ps.executeUpdate();
} catch (SQLException e) {
e.printStackTrace();
}

}
nodeslist.clear();
}

way=new Way();
nodes=new Nodes();
way.setId(idcontents);
way.setVersion(versioncontents);
way.setUid(uidioncontents);
way.setUser(usercontents);
way.setChangeset(changesetcontents);
way.setTimestamp(timestampcontents);
way.setTag(kvcontentsW);
way.setPointids(pointids);
way.setVisible(visiblecontents);

waylist.add(way);

//对要存满的waylist进行处理
if(waylist.size()>=100000)
{
DBconnect dbmysql=new DBconnect();
Connection connection=dbmysql.connmssql();
for(int i=0;i<waylist.size();i++){
try{

String strSql="";
if(polygonOrPolyline(waylist.get(i).getPointids()))
Type=true;//polygon;
else
Type=false;
if(Type==true){
//strSql = "Insert into polygon(id,version,userid,username,changeset,timestamp,visible,tag,pointids) Values(?,?,?,?,?,?,?,?,?)";
strSql = "Insert into poly(id,version,userid,username,changeset,timestamp,visible,tag,pointids,polyType,updateData) Values(?,?,?,?,?,?,?,?,?,3,0)";
}
if(Type==false){
//strSql = "Insert into polyline(id,version,userid,username,changeset,timestamp,visible,tag,pointids) Values(?,?,?,?,?,?,?,?,?)";
strSql = "Insert into poly(id,version,userid,username,changeset,timestamp,visible,tag,pointids,polyType,updateData) Values(?,?,?,?,?,?,?,?,?,2,0)";
}
//System.out.println("Poly Id:"+waylist.get(i).getId()+" version: "+waylist.get(i).getVersion()+" tag:"+waylist.get(i).getTag());
PreparedStatement ps = connection.prepareStatement(strSql);
ps.setLong(1, Long.parseLong(waylist.get(i).getId()));
ps.setInt(2, Integer.parseInt( waylist.get(i).getVersion()));
ps.setLong(3, Long.parseLong(waylist.get(i).getUid()));
ps.setString(4, waylist.get(i).getUser());
ps.setString(5, waylist.get(i).getChangeset());
ps.setTimestamp(6, Timestamp.valueOf(waylist.get(i).getTimestamp().replace("T", " ").replace("Z", "")));
ps.setString(7, waylist.get(i).getVisible());
ps.setString(8,waylist.get(i).getTag());
ps.setString(9, waylist.get(i).getPointids());
ps.executeUpdate();
}
catch(Exception e){
e.printStackTrace();
}
}
waylist.clear();
}
kvcontentsW="";
curretntag="";
pointids="";
plagW=0;
way=null;
}
}

public void endDocument() throws SAXException{

//对前面没有处理完的waylist进行处理
if(!waylist.isEmpty())
{
DBconnect dbmysql=new DBconnect();
Connection connection=dbmysql.connmssql();
for(int i=0;i<waylist.size();i++){
try{

String strSql="";
if(polygonOrPolyline(waylist.get(i).getPointids()))
Type=true;//polygon;
else
Type=false;
if(Type==true){

//strSql = "Insert into polygon(id,version,userid,username,changeset,timestamp,visible,tag,pointids) Values(?,?,?,?,?,?,?,?,?)";
strSql = "Insert into poly(id,version,userid,username,changeset,timestamp,visible,tag,pointids,polyType,updateData) Values(?,?,?,?,?,?,?,?,?,3,0)";
}
if(Type==false){
//strSql = "Insert into polyline(id,version,userid,username,changeset,timestamp,visible,tag,pointids) Values(?,?,?,?,?,?,?,?,?)";
strSql = "Insert into poly(id,version,userid,username,changeset,timestamp,visible,tag,pointids,polyType,updateData) Values(?,?,?,?,?,?,?,?,?,2,0)";
}
//System.out.println("Poly Id:"+waylist.get(i).getId()+" version: "+waylist.get(i).getVersion()+" tag:"+waylist.get(i).getTag());
PreparedStatement ps = connection.prepareStatement(strSql);
ps.setLong(1, Long.parseLong(waylist.get(i).getId()));
ps.setInt(2, Integer.parseInt( waylist.get(i).getVersion()));
ps.setLong(3, Long.parseLong(waylist.get(i).getUid()));
ps.setString(4, waylist.get(i).getUser());
ps.setString(5, waylist.get(i).getChangeset());
ps.setTimestamp(6, Timestamp.valueOf(waylist.get(i).getTimestamp().replace("T", " ").replace("Z", "")));
ps.setString(7, waylist.get(i).getVisible());
ps.setString(8,waylist.get(i).getTag());
ps.setString(9, waylist.get(i).getPointids());
ps.executeUpdate();
}
catch(Exception e){
e.printStackTrace();
}
}
waylist.clear();
}

}

private boolean polygonOrPolyline(String nodes){
//用于区分线和面数据

//If return true, the poly is a polygon.
if(nodes.length()<1) return false;
String[] ss = nodes.split(";");
if(ss.length<4) return false;
if(ss[0].equals(ss[ss.length-1])) return true;
else return false;
}
}

class Nodes{
//对应于XML中的node,数据库中的Point
private String id;
private String version;
private String uid;
private String user;
private String lon;
private String lat;
private String changeset;
private String timestamp;
private String visible;
private String tag;

public String getTag() {
return tag;
}
public void setTag(String tag) {
this.tag = tag;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
public String getUser() {
return user;
}
public void setUser(String user) {
this.user = user;
}
public String getLon() {
return lon;
}
public void setLon(String lon) {
this.lon = lon;
}
public String getLat() {
return lat;
}
public void setLat(String lat) {
this.lat = lat;
}
public String getChangeset() {
return changeset;
}
public void setChangeset(String changeset) {
this.changeset = changeset;
}
public String getTimestamp() {
return timestamp;
}
public void setTimestamp(String timestamp) {
this.timestamp = timestamp;
}
public String getVisible() {
return visible;
}
public void setVisible(String visible) {
this.visible = visible;
}
}
class Way{
//对应于XML中的way、数据库中的Polylin和Polygon
private String id;
private String version;
private String uid;
private String user;
private String changeset;
private String timestamp;
private String tag;
private String point;
private String pointids;
private String visible;

public String getVisible() {
return visible;
}
public void setVisible(String visible) {
this.visible = visible;
}
public String getPointids() {
return pointids;
}
public void setPointids(String pointids) {
this.pointids = pointids;
}
public String getPoint() {
return point;
}
public void setPoint(String point) {
this.point = point;
}
public String getTag() {
return tag;
}
public void setTag(String tag) {
this.tag = tag;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getVersion() {
return version;
}
public void setVersion(String version) {
this.version = version;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
public String getUser() {
return user;
}
public void setUser(String user) {
this.user = user;
}
public String getChangeset() {
return changeset;
}
public void setChangeset(String changeset) {
this.changeset = changeset;
}
public String getTimestamp() {
return timestamp;
}
public void setTimestamp(String timestamp) {
this.timestamp = timestamp;
}

}


class DBconnect {
//连接sqlServer数据库
public String dbname = "ceshiXML";
public Connection connmssql() {

String driverName = "com.microsoft.sqlserver.jdbc.SQLServerDriver";
String dbURL = "jdbc:sqlserver://localhost:1433; DatabaseName="+this.dbname;
String userName = "sa"; //默认用户名
String userPwd = "123456"; //密码
Connection dbConn = null;
try {
Class.forName(driverName);
dbConn = DriverManager.getConnection(dbURL, userName, userPwd);
} catch (Exception e) {
e.printStackTrace();
}
return dbConn;
}

}