将http://www.sina.com/的HTML保存到c://shengchengdeHTML.html

最新推荐文章于 2025-12-16 21:51:40 发布

转载 · 最新推荐文章于 2025-12-16 21:51:40 发布 · 1.4k 阅读

文章标签：

#html #c #exception #string #null #import

J2SE 专栏收录该内容

1 篇文章

订阅专栏

本文介绍了一个简单的Java程序：它从指定的网址抓取HTML内容，重新分行后保存到本地文件中。分行规则是：在相邻的两个标签之间（即“>”与下一个“<”之间）换行；对<style>标签内的内容在每个“}”之后换行；对<script>标签内的内容在每个“{”、“}”、“;”之后换行。



Java code

import java.net.*;

import java.io.*;

import java.util.regex.Pattern;



/**
 * Downloads a web page and saves a re-wrapped copy of its HTML to a local file.
 *
 * <p>The page is read line by line and the lines are joined with single spaces.
 * The joined text is then cut into output lines:
 * <ul>
 *   <li>between a {@code '>'} and the next {@code '<'} when only whitespace
 *       separates them;</li>
 *   <li>inside a {@code <style>} element, after every <code>}</code>;</li>
 *   <li>inside a {@code <script>} element, after every <code>{</code>,
 *       <code>}</code> and {@code ;};</li>
 *   <li>immediately before the closing {@code </style>} / {@code </script>} tag.</li>
 * </ul>
 * Every output line is stripped of surrounding whitespace and blank lines are
 * dropped.
 *
 * <p>Limitations: the splitter is purely character based. It does not understand
 * HTML comments, attribute values, or string literals and comments inside
 * scripts, and the original line breaks of the page are not preserved. Both the
 * download and the output file use the platform default charset.
 */
public class CreateHTML {

    /** Line terminator used in the generated file (same one BufferedWriter.newLine() uses). */
    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

    /**
     * Saves the HTML of http://www.sina.com/ to c://shengchengdeHTML.html.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        CreateHTML uc = new CreateHTML();
        uc.creatHTML("http://www.sina.com/", "c://shengchengdeHTML.html");
    }

    /**
     * Fetches {@code webURL}, re-wraps the HTML as described on the class and
     * writes the result to {@code local}.
     *
     * <p>I/O failures are reported with a stack trace on standard error and are
     * not propagated. When the download fails the method returns before the
     * output file is opened, so an existing file is not truncated.
     *
     * @param webURL address of the page to fetch; any scheme supported by
     *               {@link URL} works
     * @param local  path of the file to create or overwrite
     */
    public void creatHTML(String webURL, String local) {
        String page;
        try {
            page = download(webURL);
        } catch (IOException e) {
            // MalformedURLException is an IOException too. Nothing was fetched,
            // so there is nothing worth writing.
            e.printStackTrace();
            return;
        }

        String formatted = format(page);

        try {
            BufferedWriter bw = new BufferedWriter(new FileWriter(local));
            try {
                bw.write(formatted);
            } finally {
                // close() also flushes; run it even when write() failed.
                bw.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Reads the whole resource at {@code webURL}, joining its lines with single spaces. */
    private static String download(String webURL) throws IOException {
        URLConnection conn = new URL(webURL).openConnection();
        BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
        try {
            StringBuilder document = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                document.append(line).append(' ');
            }
            return document.toString();
        } finally {
            reader.close();
        }
    }

    /** Cuts {@code html} into output lines and returns them, each followed by a line separator. */
    private static String format(String html) {
        StringBuilder out = new StringBuilder(html.length() + 64);
        int length = html.length();
        int start = 0; // first character of the segment that has not been emitted yet
        int i = 0;

        while (i < length) {
            char c = html.charAt(i);
            if (c == '<') {
                String raw = rawElementAt(html, i);
                if (raw == null) {
                    i++;
                    continue;
                }
                // The closing tag cannot match at i itself, so bodyEnd > i and the
                // scan always advances.
                int close = indexOfIgnoreCase(html, "</" + raw, i);
                int bodyEnd = close < 0 ? length : close;
                int gt = html.indexOf('>', i);
                // Start after the opening tag so its attributes are never split.
                int body = (gt < 0 || gt >= bodyEnd) ? bodyEnd : gt + 1;
                boolean script = raw.equals("script");
                for (int k = body; k < bodyEnd; k++) {
                    char b = html.charAt(k);
                    if (b == '}' || (script && (b == '{' || b == ';'))) {
                        emit(out, html, start, k + 1);
                        start = k + 1;
                    }
                }
                if (close >= 0) {
                    // Put the closing tag on a line of its own.
                    emit(out, html, start, close);
                    start = close;
                }
                i = bodyEnd;
            } else if (c == '>') {
                int next = i + 1;
                while (next < length && Character.isWhitespace(html.charAt(next))) {
                    next++;
                }
                if (next < length && html.charAt(next) == '<') {
                    emit(out, html, start, next);
                    start = next;
                }
                i = next;
            } else {
                i++;
            }
        }

        // Whatever is left after the last break is written exactly once.
        emit(out, html, start, length);
        return out.toString();
    }

    /**
     * Returns {@code "style"} or {@code "script"} when an opening tag of that
     * element starts at index {@code lt}, otherwise {@code null}.
     */
    private static String rawElementAt(String html, int lt) {
        if (opensElement(html, lt, "style")) {
            return "style";
        }
        if (opensElement(html, lt, "script")) {
            return "script";
        }
        return null;
    }

    /** Tells whether {@code <name} (any letter case) starts at {@code lt} and the tag name ends there. */
    private static boolean opensElement(String html, int lt, String name) {
        if (!html.regionMatches(true, lt + 1, name, 0, name.length())) {
            return false;
        }
        int after = lt + 1 + name.length();
        if (after >= html.length()) {
            return true;
        }
        char c = html.charAt(after);
        // Rejects longer names that merely share the prefix.
        return c == '>' || c == '/' || Character.isWhitespace(c);
    }

    /** Case-insensitive {@code indexOf}; returns -1 when {@code needle} does not occur at or after {@code from}. */
    private static int indexOfIgnoreCase(String text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int k = from; k <= last; k++) {
            if (text.regionMatches(true, k, needle, 0, needle.length())) {
                return k;
            }
        }
        return -1;
    }

    /** Appends {@code text[from, to)} to {@code out} as one line, trimmed; blank segments are skipped. */
    private static void emit(StringBuilder out, String text, int from, int to) {
        while (from < to && Character.isWhitespace(text.charAt(from))) {
            from++;
        }
        while (to > from && Character.isWhitespace(text.charAt(to - 1))) {
            to--;
        }
        if (from < to) {
            out.append(text, from, to).append(LINE_SEPARATOR);
        }
    }
}