Java读取HTML文件,截取<body>标签中的内容

本文介绍如何用Java程序读取指定路径下的HTML文件,并从中提取正文内容与图片链接。程序先根据试题号拼接出HTML文件的路径,再用Java内置的IO方法读取文件,然后截取HTML中<body>标签内的内容,并通过正则表达式提取<img>标签中的图片路径。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

 1     public String readfile(String filePath){
 2         File file = new File(filePath);  
 3         InputStream input = null;
 4         try {
 5             input = new FileInputStream(file);
 6         } catch (FileNotFoundException e) {
 7             e.printStackTrace();
 8         }  
 9         StringBuffer buffer = new StringBuffer();  
10         byte[] bytes = new byte[1024];
11         try {
12             for(int n ; (n = input.read(bytes))!=-1 ; ){  
13                 buffer.append(new String(bytes,0,n,"GBK"));  
14             }
15         } catch (IOException e) {
16             e.printStackTrace();
17         }
18 //        System.out.println(buffer);
19         return buffer.toString();  
20     }
21     
22      public String getBody(String val) {
23           String start = "<body>";
24           String end = "</body>";
25           int s = val.indexOf(start) + start.length();
26           int e = val.indexOf(end);
27         return val.substring(s, e);
28     }
29     
 1     public static void main(String [] args){
 2         OaDao m = new OaDao();
 3 //        String sql = "SELECT sth,xdh FROM TK_ST_0331 where sth='022012050101131000100' and rownum <=10";
 4         String sql = "select t.sth , t.stgjz ,t.stly, x.mc from TK_ST_0331 t ,TK_STK_ST_0331 k,TK_TX X  where t.sth = k.sth AND X.BH = t.tx and rownum <10 ";
 5         List<OaVo> datalist= m.findAll(sql);
 6         for(OaVo vo : datalist){
 7             System.out.println(vo.getVal1()+"///"+vo.getVal2());
 8             
 9 //            String sth = "022012010100000100100";
10             String sth = vo.getVal1();
11             String kmh = sth.substring(0, 2);    //科目号
12             String nf = sth.substring(2, 6);    //年份
13             String yf = sth.substring(6,10);    //月份
14             String serialno = sth.substring(10, 16);    //序列号
15             String stxl = sth.substring(16, 19);    //题型
16             String path ="/"+kmh+"/"+nf+"/"+yf+"/"+serialno+"/"+stxl+"/";
17             
18             String tm_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_tm.htm";
19             String da_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_da.htm";
20             String jx_path ="H:/tk_source/"+kmh+"/"+yf+"/"+serialno+"/"+stxl+"/"+sth+"_jx.htm";
21             
22     //        String path = "H:/tk_source/02/0101/000001/001/022012010100000100100_da.htm";
23             
24             String tm = m.getBody(m.readfile(tm_path));
25             System.out.println("----------------------题目------------------------------");
26             System.out.println(tm);
27             
28             String da = m.getBody(m.readfile(da_path));
29             System.out.println("----------------------答案------------------------------");
30             System.out.println(da);
31             
32             
33             String jx = m.getBody(m.readfile(da_path));
34             System.out.println("----------------------解析------------------------------");
35             System.out.println(jx);
36         }
37     }

 

 1 /**
 2      * 从HTML源码中提取图片路径,最后以一个 String 类型的 List 返回,如果不包含任何图片,则返回一个 size=0 的List
 3      * 需要注意的是,此方法只会提取以下格式的图片:.jpg|.bmp|.eps|.gif|.mif|.miff|.png|.tif|.tiff|.svg|.wmf|.jpe|.jpeg|.dib|.ico|.tga|.cut|.pic
 4      * @param htmlCode HTML源码
 5      * @return <img>标签 src 属性指向的图片地址的List集合
 6      * @author Carl He
 7      */
 8     public static List<String> getImageSrc(String htmlCode) {
 9         List<String> imageSrcList = new ArrayList<String>();
10         Pattern p = Pattern.compile("<img//b[^>]*//bsrc//b//s*=//s*('|/")?([^'/"/n/r/f>]+(//.jpg|//.bmp|//.eps|//.gif|//.mif|//.miff|//.png|//.tif|//.tiff|//.svg|//.wmf|//.jpe|//.jpeg|//.dib|//.ico|//.tga|//.cut|//.pic)//b)[^>]*>", Pattern.CASE_INSENSITIVE);
11         Matcher m = p.matcher(htmlCode);
12         String quote = null;
13         String src = null;
14         while (m.find()) {
15             quote = m.group(1);
16             src = (quote == null || quote.trim().length() == 0) ? m.group(2).split("//s+")[0] : m.group(2);
17             imageSrcList.add(src);
18         }
19         return imageSrcList;
20     }

 

转载于:https://www.cnblogs.com/huanglibin/p/6671202.html

这个是BeanShell Sampler 的代码import java.net.*; import java.io.*; try { // 创建TCP连接 Socket socket = new Socket("10.192.32.178", 9810); socket.setSoTimeout(5000); // 发送请求 OutputStream out = socket.getOutputStream(); String request = "000947QPC QP2001 <?xml version=\"1.0\" encoding=\"GBK\"?><root><head><transeq>20250626000003s1120110230</transeq><servtp>CMS</servtp><prcscd>QP2001</prcscd><pwdmod></pwdmod><accflg></accflg><router>66</router><termno></termno><cityno></cityno><brchno>0000000</brchno><userid>zxcx002</userid></head><body><req><tranCode>QP2001</tranCode><busiChannel>000000</busiChannel><transq>20211126000003s1120110230</transq><rptType>A</rptType></req></body></root>"; out.write(request.getBytes("GBK")); out.flush(); // 接收响应 BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream())); String response = ""; String line; while ((line = in.readLine()) != null) { response += line + "\n"; } response= response.replace("000550QPC QP2001 ", ""); // 将响应保存到JMeter变量中 vars.put("tcpResponse", response); // 关闭连接 in.close(); out.close(); socket.close(); // 设置取样器成功 SampleResult.setSuccessful(true); SampleResult.setResponseData(response, "GBK"); } catch (Exception e) { SampleResult.setSuccessful(false); SampleResult.setResponseMessage(e.toString()); } 以下是 BeanShell PostProcessor的代码 import javax.xml.parsers.*; import org.w3c.dom.*; import javax.xml.xpath.*; String response = prev.getResponseDataAsString(); try { // 创建Document对象 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Document doc = builder.parse(new java.io.ByteArrayInputStream(response.getBytes())); // 创建XPath对象 XPathFactory xPathfactory = XPathFactory.newInstance(); XPath xpath = xPathfactory.newXPath(); // 提取参数(示例:提取<sessionId>标签的值) String url = xpath.evaluate("/root/body/resp/rptURL", doc); String rptTocken = xpath.evaluate("/root/body/resp/rptTocken", doc); String rptURL = url.substring(7, 20); String port = 
url.substring(21, 25); String local = url.substring(26); // 保存参数 log.info("rptURL:"+rptURL); log.info("port:"+port); log.info("local:"+local); log.info("rptTocken:"+rptTocken); vars.put("rptURL", rptURL); vars.put("port", port); vars.put("local", local); vars.put("rptTocken", rptTocken); } catch (Exception e) { log.error("解析XML失败: " + e.getMessage()); } http请求配置如图 非gui运行代码后出现 ERROR - jmeter.util.BeanShellTestElement: 解析XML失败: Content is not allowed in prolog. 2025/07/09 13:40:28 ERROR - jmeter.util.BeanShellTestElement: 解析XML失败: Content is not allowed in prolog. 2025/07/09 13:40:28 ERROR - jmeter.util.BeanShellTestElement: 解析XML失败: Content is not allowed in prolog. 服务器是linux jmeter安装在window 解决办法 怎么操作
最新发布
07-10
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值