用htmlParser把HTML页面信息解析到文本中


1 HTML源码

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<!-- saved from url=(0045)http://10.42.158.2/info/news.asp -->
<!-- Sample article page as saved from the browser. The Java code in this post looks for the TD cell with class=briefingtext, align=justify and colSpan=2 and collects the text beneath it. -->
<HTML><HEAD><TITLE>散文</TITLE>
<SCRIPT language=JavaScript>
self.moveTo(0,0); 
self.resizeTo(screen.width,screen.height); 
</SCRIPT>

<STYLE type=text/css>A:link {
	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
}
A:active {
	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
}
A:visited {
	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
}
A:hover {
	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
}
</STYLE>

<META name=GENERATOR content="MSHTML 8.00.6001.18702"></HEAD>
<BODY >
<SCRIPT language=JavaScript>
</SCRIPT>
<!-- Outer layout table; the breadcrumb row comes first, the article rows follow. -->
<TABLE border=0 cellSpacing=0 cellPadding=0 width=778 align=center>
  <TBODY>
  <TR>
    <TD class=text> </TD></TR>
  <TR>
    <TD height=30>
      <TABLE class=text border=0 cellSpacing=0 cellPadding=0 width="89%" 
      align=center>
        <TBODY>
        <TR>
          <TD>当前位置:首页>>散文</TD>
          <TD ></TD></TR>
<!-- NOTE(review): the markup is unbalanced as saved. Five TABLE elements are opened in this page but only four are closed, so one of the tables opened above is never terminated. -->
  <TR>
    <TD >
      <TABLE border=0 cellSpacing=0 cellPadding=0 width="90%" align=center>
        <TBODY>
        <TR>
          <TD>
            <TABLE border=0 cellSpacing=0 cellPadding=0 width="83%" 
align=center>
              <TBODY>
              <TR>
                <TD height=160 colSpan=2></TD></TR>
<!-- Cell with class=briefingred: presumably the article title (TODO confirm). -->
              <TR>
                <TD class=briefingred colSpan=2 align=middle>大海</TD></TR>
              
              <TR>
                <TD class=news height=20 colSpan=2 align=middle>
                  <HR color=#ff0000 SIZE=5>
                </TD></TR>
              <TR>
                <TD height=10 colSpan=2></TD></TR>
<!-- Article body cell: the only cell carrying class=briefingtext, align=justify and colSpan=2, which is the combination the parser tests for. -->
              <TR>
                <TD class=briefingtext colSpan=2 align=justify>
                  <DIV></DIV>
            
                  <DIV 
                  align=left><BR>一滴水怎样才能不干涸”相传,古代有一位学者这样问他的弟子。
孤零零的一滴水,论容量只能以毫升计,体积也微乎其微,风能吹干它,阳光也能晒干它,其寿命能有几何……弟子答不上来。:<BR>学者说:“把它放到大海里去。;<BR>是的,一滴水的寿命是短暂的。但当它汇入大海,与浩瀚的大海融为一体时,就获得了新的生命。大海永远不会干涸,一滴水就永存于大海之中,<BR>雷锋同志说:“一滴水只有放进大海里才能永远不干,一个人只有当他把自己和集体事业融合在一起的时候才能有力量。”<BR>可见,团结就有力量。;<BR>大海,总是冥冥之中给予人一种澎湃的感觉。但见,大海,一浪未平一浪又起,如同人的命运,时起时落,不可能有唾手可得的成功。人的一生尚不可能永远风平浪静,更何况一个国家,一个民族呢欲速则不达,如果我们的民族不是经历了风风雨雨,大挫大折的锻炼,又怎么能如此的经久不衰呢</DIV>
                 
                            
                 </TD></TR>
              <TR>
                <TD height=120 
      colSpan=2></TD></TR></TBODY></TABLE></TD></TR></TBODY></TABLE></TD></TR></TBODY></TABLE>
<!-- Empty trailing table. -->
<TABLE cellSpacing=0 cellPadding=0 width=676 align=center>
 </TABLE></BODY></HTML>


2 config.properties文件配置

# Execution interval, in minutes (the original note gives three minutes as the intended default).
interval=2
# Parent directory of the output file. "\\" is an escaped backslash in .properties syntax.
# NOTE(review): the original comment calls this a "bayonet share dir"; presumably a shared/FTP directory, confirm.
path=F:\\ftp_root
#url=http://10.42.158.2/info/list3.asp?id=8&page=1&num=30&date=1
# URL of the HTML page to fetch.
url=http://10.42.158.2/info/list3.asp?id=8&page=1&num=30&date=1
# Name of the output file.
fileName=sea.txt

3 Java代码

package com.odin.cn.warning;

import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Pattern;

import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileOutputStream;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;

import com.odin.cn.util.ConfigUtils;
import com.odin.cn.util.PropertiesUtils;
import com.odin.cn.util.Sea;

/**
 * Periodically downloads an HTML page, collects the text found under the
 * article body cell and writes it to a file as a JSON-like document.
 *
 * <p>The article body cell is the {@code <td>} whose {@code class} is
 * {@code briefingtext}, whose {@code align} is {@code justify} and whose
 * {@code colspan} is {@code 2}.
 */
public class SetArticle {
	/** Pattern of the timestamp appended to some log messages. */
	private static final String LONG_FORMAT = "yyyy-MM-dd HH:mm:ss";
	private static final Logger logger = Logger
			.getLogger(SetArticle.class);
	/** Runtime settings (url, path, fileName, interval); set by the constructor. */
	public ConfigUtils config;

	/**
	 * @param config settings used by {@link #saveFile()}
	 */
	public SetArticle(ConfigUtils config) {
		this.config = config;
	}

	/**
	 * Formats the current time for log messages. A new formatter is created
	 * on every call because {@link SimpleDateFormat} is not thread-safe and
	 * this method is used from both the main thread and the worker thread.
	 */
	private static String timestamp() {
		return new SimpleDateFormat(LONG_FORMAT).format(new Date());
	}

	/**
	 * Fetches {@code url}, decodes the response as GBK and extracts the text
	 * entries of every article body cell.
	 *
	 * @param url address of the page to fetch
	 * @return the extracted entries; an empty list when nothing matched or
	 *         when fetching/parsing failed (the failure is logged)
	 */
	public List<Sea> parseHtmlByHttp(String url) {
		List<Sea> sea = new ArrayList<Sea>();
		DefaultHttpClient httpclient = new DefaultHttpClient();
		try {
			// The page is requested with POST, exactly as before.
			HttpPost request = new HttpPost(url);
			HttpResponse response = httpclient.execute(request);
			HttpEntity entity = response.getEntity();
			String body = EntityUtils.toString(entity, "GBK");
			Parser parser = new Parser(body);
			// Pre-select nodes whose text starts with "td"; the instanceof
			// check below keeps only real table cells.
			// NOTE(review): this prefix test is case-sensitive; confirm the
			// served page uses lower-case tag names.
			NodeFilter cellFilter = new NodeFilter() {
				@Override
				public boolean accept(Node node) {
					return node.getText().startsWith("td");
				}
			};
			NodeList nodeList = parser.extractAllNodesThatMatch(cellFilter);
			for (int i = 0; i < nodeList.size(); i++) {
				Node node = nodeList.elementAt(i);
				if (!(node instanceof TableColumn)) {
					continue;
				}
				TableColumn cell = (TableColumn) node;
				if (!isArticleBodyCell(cell)) {
					continue;
				}
				NodeList children = cell.getChildren();
				if (children == null) {
					// An empty cell has no child list; skip it instead of
					// failing the whole parse with a NullPointerException.
					continue;
				}
				for (int j = 0; j < children.size(); j++) {
					sea = foreachNode(sea, children.elementAt(j));
				}
				// Keep entries with a non-empty message. The first collected
				// entry is deliberately skipped, as in the original code.
				// NOTE(review): presumably that entry is leading whitespace
				// of the cell; confirm.
				List<Sea> kept = new ArrayList<Sea>();
				for (int j = 1; j < sea.size(); j++) {
					Sea pw = sea.get(j);
					String message = pw.getMessage();
					if (message != null && !"".equals(message)) {
						kept.add(pw);
					}
				}
				sea = kept;
			}
		} catch (Exception e) {
			// Log through log4j (with stack trace) instead of printStackTrace.
			logger.error("解析网页失败:" + e.getMessage(), e);
		} finally {
			httpclient.getConnectionManager().shutdown();
		}
		return sea;
	}

	/** Tells whether a table cell carries the three attributes of the article body cell. */
	private static boolean isArticleBodyCell(TableColumn cell) {
		return "2".equals(cell.getAttribute("colspan"))
				&& "justify".equals(cell.getAttribute("align"))
				&& "briefingtext".equals(cell.getAttribute("class"));
	}

	/**
	 * Walks {@code node} recursively and appends one {@link Sea} per
	 * non-empty text node to {@code list}.
	 *
	 * @param list accumulator; modified in place
	 * @param node node to visit, may be {@code null}
	 * @return the same {@code list} instance
	 */
	private List<Sea> foreachNode(List<Sea> list, Node node) {
		if (node == null) {
			return list;
		}
		NodeList children = node.getChildren();
		if (children != null) {
			for (int i = 0; i < children.size(); i++) {
				foreachNode(list, children.elementAt(i));
			}
		} else if (node instanceof TextNode) {
			// Strip Windows line breaks.
			// NOTE(review): the original also called replaceAll(",", ",") and
			// replaceAll("", ""), both no-ops as written; their arguments
			// were probably special characters lost in transcription.
			String message = node.getText().replace("\r\n", "");
			// Keep only the part before the first comma, unless the list
			// currently holds exactly one entry.
			// NOTE(review): the intent of this rule is unclear; behavior is
			// preserved as found.
			int index = message.indexOf(",");
			if (index > 0 && list.size() != 1) {
				message = message.substring(0, index);
			}
			if (!"".equals(message)) {
				Sea pw = new Sea();
				pw.setMessage(message);
				list.add(pw);
			}
		}
		return list;
	}

	/**
	 * Escapes characters that would break a double-quoted string value
	 * (quote, backslash, line feed, carriage return, tab).
	 */
	private static String escapeForJson(String text) {
		if (text == null) {
			return "";
		}
		StringBuilder out = new StringBuilder(text.length() + 16);
		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			switch (c) {
			case '"':
				out.append("\\\"");
				break;
			case '\\':
				out.append("\\\\");
				break;
			case '\n':
				out.append("\\n");
				break;
			case '\r':
				out.append("\\r");
				break;
			case '\t':
				out.append("\\t");
				break;
			default:
				out.append(c);
			}
		}
		return out.toString();
	}

	/**
	 * Fetches the configured page and writes the extracted entries to
	 * {@code config.path}/{@code config.fileName} in the form
	 * {@code {date:"yyyy-MM-dd",data:[{message:"..."},...]}}.
	 *
	 * <p>Nothing is written when no entries were extracted. Failures are
	 * logged and not propagated.
	 */
	public void saveFile() {
		BufferedWriter bw = null;
		try {
			List<Sea> sea = parseHtmlByHttp(config.getUrl());
			if (sea == null || sea.isEmpty()) {
				logger.info("保存到文件夹失败:未获取到网页信息");
				return;
			}
			String date = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
			StringBuilder sb = new StringBuilder();
			sb.append("{date:\"").append(date).append("\",data:[");
			for (int i = 0; i < sea.size(); i++) {
				if (i > 0) {
					sb.append(',');
				}
				// Escape the text so quotes or backslashes in the page
				// cannot corrupt the generated document.
				sb.append("{message:\"")
						.append(escapeForJson(sea.get(i).getMessage()))
						.append("\"}");
			}
			sb.append("]}");
			// File(parent, child) joins with the platform separator instead
			// of a hard-coded backslash.
			File target = new File(config.path, config.fileName);
			// NOTE(review): FileWriter uses the platform default charset, as
			// the original did; confirm which encoding consumers expect.
			bw = new BufferedWriter(new FileWriter(target));
			bw.write(sb.toString());
			bw.flush();
			logger.info("成功保存文件");
		} catch (Exception e) {
			logger.error("保存文件失败" + e.getMessage(), e);
		} finally {
			try {
				if (bw != null) {
					bw.close();
				}
			} catch (Exception e2) {
				logger.error("关闭连接发送异常" + e2.getMessage(), e2);
			}
		}
	}

	/**
	 * Configures log4j, loads the settings and starts a worker thread that
	 * calls {@link #saveFile()} every {@code config.interval} minutes until
	 * the thread is interrupted.
	 */
	public static void main(String[] args) {
		PropertyConfigurator.configure("conf/log4j.properties");
		final ConfigUtils config = PropertiesUtils.getConfig();
		final SetArticle leader = new SetArticle(config);
		logger.info("启动成功");
		try {
			new Thread(new Runnable() {
				@Override
				public void run() {
					while (!Thread.currentThread().isInterrupted()) {
						try {
							leader.saveFile();
							// Interval is in minutes; long arithmetic avoids
							// int overflow for large values.
							Thread.sleep(60L * 1000L * config.interval);
						} catch (InterruptedException e) {
							// Restore the flag so the loop condition ends
							// the worker instead of swallowing the request.
							Thread.currentThread().interrupt();
						} catch (Exception e) {
							logger.error("线程执行出现异常:" + timestamp()
									+ e.getMessage(), e);
						}
					}
				}
			}).start();
		} catch (Exception e) {
			logger.error("线程执行出现异常:" + timestamp() + e.getMessage(), e);
		} finally {
			// Runs as soon as the worker has been launched (or failed to
			// launch), not when the worker ends.
			logger.info("线程终止时间2:" + timestamp());
		}
	}
}

4 属性文件加载类

package com.odin.cn.util;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Properties;

import org.apache.log4j.Logger;


/**
 * Loads {@code conf/config.properties} from the working directory into a
 * {@link ConfigUtils} instance.
 */
public class PropertiesUtils {
	private static Logger logger = Logger.getLogger(PropertiesUtils.class);

	/**
	 * Interval (minutes) used when the {@code interval} property is missing
	 * or not a valid integer; the config file's own comment names three
	 * minutes as the default.
	 */
	private static final int DEFAULT_INTERVAL_MINUTES = 3;

	/**
	 * Reads {@code <user.dir>/conf/config.properties} and copies the
	 * {@code path}, {@code interval}, {@code fileName} and {@code url}
	 * properties into a new {@link ConfigUtils}.
	 *
	 * <p>If the file cannot be read the error is logged and the returned
	 * object carries {@code null} for the string properties and the default
	 * interval.
	 *
	 * @return the populated configuration object, never {@code null}
	 */
	public static ConfigUtils getConfig(){
		ConfigUtils db = new ConfigUtils();
		String relativelyPath = System.getProperty("user.dir").replace("\\", "/")+"/conf/"+"config.properties";
		InputStream is = null;
		Properties dbProps = new Properties();
		try {
			is = new BufferedInputStream(new FileInputStream(relativelyPath));
			dbProps.load(is);
			logger.info("读取数据库config.properties配置文件成功!");
		} catch (Exception e) {
			logger.error(e.getMessage(), e);
		} finally {
			// Properties.load does not close the stream; release it here.
			if (is != null) {
				try {
					is.close();
				} catch (Exception ignored) {
					// Nothing useful can be done if close fails; the
					// properties have already been read (or the load
					// failure has already been logged).
				}
			}
		}
		db.setPath(dbProps.getProperty("path"));
		db.setInterval(parseInterval(dbProps.getProperty("interval")));

		db.setFileName(dbProps.getProperty("fileName"));
		db.setUrl(dbProps.getProperty("url"));
		return db;
	}

	/**
	 * Parses the interval property, tolerating surrounding whitespace
	 * (trailing blanks are kept by {@link Properties#load}).
	 *
	 * @param raw property value, may be {@code null}
	 * @return the parsed value, or {@link #DEFAULT_INTERVAL_MINUTES} when
	 *         {@code raw} is missing or not an integer
	 */
	private static int parseInterval(String raw) {
		if (raw != null) {
			try {
				return Integer.parseInt(raw.trim());
			} catch (NumberFormatException e) {
				logger.error("interval配置无效:" + raw, e);
			}
		}
		logger.warn("interval配置缺失或无效,使用默认值" + DEFAULT_INTERVAL_MINUTES);
		return DEFAULT_INTERVAL_MINUTES;
	}
}

优质内容筛选与推荐>>
1、woff/woff2字体404找不到
2、poj 2757 : 最长上升子序列(JAVA)
3、js 里面的 function 与 Function
4、调查问卷
5、Python从菜鸟到高手(18):类与方法的私有化


长按二维码向我转账

受苹果公司新规定影响,微信 iOS 版的赞赏功能被关闭,可通过二维码转账支持公众号。

    阅读
    好看
    已推荐到看一看
    你的朋友可以在“发现”-“看一看”看到你认为好看的文章。
    已取消,“好看”想法已同步删除
    已推荐到看一看 和朋友分享想法
    最多200字,当前共 发送

    已发送

    朋友将在看一看看到

    确定
    分享你的想法...
    取消

    分享想法到看一看

    确定
    最多200字,当前共

    发送中

    网络异常,请稍后重试

    微信扫一扫
    关注该公众号