《2021博客之星年度总评选》数据采集样例程序

《2021博客之星年度总评选》数据采集Java样例程序


pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the sample scrapers.
  Dependencies: Selenium's Chrome driver (page scraping) and Apache POI OOXML (.xlsx export).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>1</groupId>
    <artifactId>_psimplemvn</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!--
          The Java sources contain non-ASCII string literals (the Excel column headers), so the
          source encoding is pinned instead of being inherited from the build machine's locale.
        -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <!-- Compile for Java 8. -->
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- Browser automation used to load and query the rendered pages. -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>4.0.0</version>
        </dependency>
        <!-- XSSF (.xlsx) workbook support. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>
    </dependencies>


</project>

2020线上投票博客之星数据采集|样例程序

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: BlogStarStatisticsTest
 * Author: wangyetao
 * Date: 21-12-26 23:38:10
 * Description: 线上投票博客之星数据采集
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 *
 * <author> wangyetao
 * <time> 2021年 12月 27日 星期一 07:46:27 CST
 * <version> 版本号
 * <desc> 最近一次修改
 *
 * <author> wangyetao
 * <time> 2021年 12月 27日 星期一 20:37:59 CST
 * <version> 版本号
 * <desc> 最近一次修改
 */
package simple.call.blogstar;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Scrapes the candidate list of the 2020 "Blog Star" online vote page with Selenium/Chrome and
 * exports one row per candidate to an {@code .xlsx} workbook.
 *
 * <p>The browser session is always terminated, and the workbook and output stream are always
 * closed, even when scraping or writing fails.
 *
 * @author wangyetao
 */
public class BlogStarStatisticsTest {

    /** Directory the workbook is written into. */
    private static final String OUTPUT_DIR = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Workbook file name without extension. */
    private static final String FILE_NAME = "blog_star2020";
    /** Name of the single sheet that receives the data. */
    private static final String SHEET_NAME = FILE_NAME.toUpperCase();
    private static final String SUFFIX = ".xlsx";
    /** Vote page that lists every candidate. */
    private static final String URL = "https://bss.youkuaiyun.com/m/topic/blog_star2020";
    /** Pattern passed to {@code TimeUtil.getStampToString} for the capture-time column. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Entry point: scrapes the vote page, then writes the result to
     * {@code OUTPUT_DIR + FILE_NAME + SUFFIX}.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Tell Selenium where the chromedriver binary lives.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        List<BlogStar> blogStars = collectBlogStars();

        // Column headers, in the same order the cells are written in writeWorkbook.
        List<String> heads = new ArrayList<>();
        heads.add("序号");
        heads.add("博客简称");
        heads.add("小头像url");
        heads.add("码龄(年)");
        heads.add("年度原创博文数");
        heads.add("当前票数");
        heads.add("录入时间");

        System.out.println("Creating excel");
        try {
            writeWorkbook(heads, blogStars);
        } catch (IOException e) {
            // Covers FileNotFoundException as well; the run is reported as finished either way.
            e.printStackTrace();
        }

        System.out.println("Done");
    }

    /**
     * Opens the vote page and converts every {@code <li>} under {@code #blogList} into a
     * {@code BlogStar}.
     *
     * @return the scraped candidates in page order
     * @throws InterruptedException if interrupted during the render wait
     */
    private static List<BlogStar> collectBlogStars() throws InterruptedException {
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(URL);

            // DOM structure of a single entry inside <ul id="blogList">:
            //   <li>
            //     <a target="_blank" href=".../blog_star2020/detail?username=...">
            //       <span class="num">001</span>
            //       <div class="avatar"><img src="..." alt=""></div>
            //       <div class="name">...</div>
            //       <div class="level"><i class="icon-level icon-level-5"></i>码龄6年</div>
            //       <div class="statistics">
            //         <p class="blog-num">2020年度原创博文:77 篇</p>
            //         <p class="current-vote">当前票数: <em>392</em> 票</p>
            //       </div>
            //       <div class="footer">...</div>
            //     </a>
            //   </li>

            // Give the page time to finish rendering.
            Thread.sleep(3000);

            List<BlogStar> blogStars = new ArrayList<>();
            for (WebElement element : driver.findElements(By.xpath("//*[@id=\"blogList\"]/li"))) {
                BlogStar blogStar = new BlogStar();

                // Capture time.
                blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), TIME_PATTERN);
                // Sequence number.
                blogStar.num = element.findElement(By.className("num")).getText();
                // Blogger display name.
                blogStar.name = element.findElement(By.className("name")).getText();
                // Avatar image URL.
                blogStar.avatarUrl = element.findElement(By.tagName("img")).getAttribute("src");
                // Coding age in years.
                // NOTE(review): presumably getInts returns the integers found in the text, in order - confirm.
                blogStar.intlevel = StringUtil.getInts(element.findElement(By.className("level")).getText())[0];
                // The blog-num text starts with the year ("2020..."), so the post count is taken from index 1.
                blogStar.intBlogNum = StringUtil.getInts(element.findElement(By.className("blog-num")).getText())[1];
                // Current vote count.
                blogStar.intCurrentVote = StringUtil.getInts(element.findElement(By.className("current-vote")).getText())[0];
                blogStars.add(blogStar);
            }
            return blogStars;
        } finally {
            // quit() ends the whole browser session; running it in finally avoids leaking it on failure.
            driver.quit();
        }
    }

    /**
     * Builds a fresh workbook with a header row followed by one row per candidate and writes it
     * to disk.
     *
     * @param heads     column headers for row 0
     * @param blogStars rows to export
     * @throws IOException if the file cannot be created or written
     */
    private static void writeWorkbook(List<String> heads, List<BlogStar> blogStars) throws IOException {
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);

            // Column widths are in 1/256 of a character: narrow first column, wide timestamp column.
            for (int i = 0; i < heads.size(); i++) {
                int chars;
                if (i == 0) {
                    chars = 6;
                } else if (i == 6) {
                    chars = 20;
                } else {
                    chars = 15;
                }
                sheet.setColumnWidth(i, chars * 256);
            }

            // Header row.
            Row headerRow = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(heads.get(i));
            }

            // Data rows start right below the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(blogStar.num);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.avatarUrl);
                row.createCell(colNum++).setCellValue(blogStar.intlevel);
                row.createCell(colNum++).setCellValue(blogStar.intBlogNum);
                row.createCell(colNum++).setCellValue(blogStar.intCurrentVote);
                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            try (FileOutputStream outputStream = new FileOutputStream(OUTPUT_DIR + FILE_NAME + SUFFIX)) {
                workbook.write(outputStream);
            }
        }
    }

}

采集样例

  • 在这里插入图片描述

2020投票贡献排行榜数据采集|样例程序

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: BlogStarStatisticsVoteLeaderboardList
 * Author: wangyetao
 * Date: 21-12-27 02:43:32
 * Description: 投票贡献排行榜
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 */
package simple.call.blogstar;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Scrapes the "vote contribution leaderboard" shown on one candidate's detail page and exports
 * one row per supporter to an {@code .xlsx} workbook.
 *
 * <p>The browser session is always terminated, and the workbook and output stream are always
 * closed, even when scraping or writing fails.
 *
 * @author wangyetao
 */
public class BlogStarStatisticsVoteLeaderboardList {

    /** Directory the workbook is written into. */
    private static final String OUTPUT_DIR = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Workbook file name without extension; it is also the {@code username} in {@link #URL}. */
    private static final String FILE_NAME = "aa518189";
    private static final String SHEET_NAME = FILE_NAME;
    private static final String SUFFIX = ".xlsx";
    /** Detail page of the candidate whose leaderboard is scraped. */
    private static final String URL = "https://bss.youkuaiyun.com/m/topic/blog_star2020/detail?username=aa518189";
    /** Pattern passed to {@code TimeUtil.getStampToString} for the capture-time column. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Entry point: scrapes the leaderboard, then writes the result to
     * {@code OUTPUT_DIR + FILE_NAME + SUFFIX}.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Tell Selenium where the chromedriver binary lives.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        List<BlogStar> blogVotes = collectVotes();

        // Column headers, in the same order the cells are written in writeWorkbook.
        List<String> heads = new ArrayList<>();
        heads.add("编号");
        heads.add("博粉名称");
        heads.add("码龄(年)");
        heads.add("支持票数");
        heads.add("录入时间");

        System.out.println("Creating excel");
        try {
            writeWorkbook(heads, blogVotes);
        } catch (IOException e) {
            // Covers FileNotFoundException as well; the run is reported as finished either way.
            e.printStackTrace();
        }

        System.out.println("Done");
    }

    /**
     * Opens the detail page and converts every {@code <li>} under {@code #voteLeaderboardList}
     * into a {@code BlogStar} carrying the supporter's data.
     *
     * @return the scraped supporters in page order
     * @throws InterruptedException if interrupted during the render wait
     */
    private static List<BlogStar> collectVotes() throws InterruptedException {
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(URL);

            // DOM structure of a single entry inside <ul id="voteLeaderboardList">:
            //   <li class="best-blogger-wrapper">
            //     <div class="left">
            //       <span class="num">1</span>
            //       <span class="text">swagLi</span>
            //       <span class="icon-level icon-level-3"></span>
            //       <span class="best-blogger"></span>
            //     </div>
            //     <div class="right">
            //       <span class="code-age">码龄4年</span>
            //       <span class="vote-num">36票</span>
            //     </div>
            //   </li>

            // Give the page time to finish rendering.
            Thread.sleep(2000);

            List<BlogStar> blogVotes = new ArrayList<>();
            for (WebElement element : driver.findElements(By.xpath("//*[@id=\"voteLeaderboardList\"]/li"))) {
                BlogStar blogStar = new BlogStar();

                // Capture time.
                blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), TIME_PATTERN);
                // Rank number.
                blogStar.num = element.findElement(By.className("num")).getText();
                // Supporter display name.
                blogStar.name = element.findElement(By.className("text")).getText();
                // Coding age in years.
                // NOTE(review): presumably getInts returns the integers found in the text, in order - confirm.
                blogStar.intlevel = StringUtil.getInts(element.findElement(By.className("code-age")).getText())[0];
                // Number of votes contributed.
                blogStar.intCurrentVote = StringUtil.getInts(element.findElement(By.className("vote-num")).getText())[0];
                blogVotes.add(blogStar);
            }
            return blogVotes;
        } finally {
            // quit() ends the whole browser session; running it in finally avoids leaking it on failure.
            driver.quit();
        }
    }

    /**
     * Builds a fresh workbook with a header row followed by one row per supporter and writes it
     * to disk.
     *
     * @param heads     column headers for row 0
     * @param blogVotes rows to export
     * @throws IOException if the file cannot be created or written
     */
    private static void writeWorkbook(List<String> heads, List<BlogStar> blogVotes) throws IOException {
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);

            // Column widths are in 1/256 of a character: narrow first column, wide timestamp column.
            for (int i = 0; i < heads.size(); i++) {
                int chars;
                if (i == 0) {
                    chars = 6;
                } else if (i == 4) {
                    chars = 20;
                } else {
                    chars = 15;
                }
                sheet.setColumnWidth(i, chars * 256);
            }

            // Header row.
            Row headerRow = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(heads.get(i));
            }

            // Data rows start right below the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogVotes) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(blogStar.num);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.intlevel);
                row.createCell(colNum++).setCellValue(blogStar.intCurrentVote);
                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            try (FileOutputStream outputStream = new FileOutputStream(OUTPUT_DIR + FILE_NAME + SUFFIX)) {
                workbook.write(outputStream);
            }
        }
    }
}

采集样例

  • 在这里插入图片描述

2021线上评分TOP90数据采集|样例程序

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: Blogstar2021
 * Author: wangyetao
 * Date: 21-12-28 15:50:02
 * Description: 线上评分TOP90数据采集,输出blogstar2021.xlsx
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 *
 * <author> wangyetao
 * <time> 2022年 01月 01日 星期六 06:38:36 CST
 * <version> 版本号
 * <desc> 最近一次修改
 */
package simple.call.blogstar;

import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Scrapes the 2021 "Blog Star" online-scoring TOP90 lists (one list per field tab) and appends
 * the result as a new, timestamp-named sheet to {@code blogstar2021.xlsx}.
 *
 * <p>Existing sheets in the workbook are preserved. If the workbook file is missing or empty a
 * new workbook is started instead of discarding the scraped data.
 *
 * @author wangyetao
 */
public class Blogstar2021 {

    /** Directory that holds the workbook. */
    private static final String OUTPUT_DIR = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Workbook file name without extension. */
    private static final String FILE_NAME = "blogstar2021";
    /** Each run writes to its own sheet, named after the time this class was initialised. */
    private static final String SHEET_NAME = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
    private static final String SUFFIX = ".xlsx";
    /** Page that hosts the per-field scoring tabs. */
    private static final String URL = "https://www.youkuaiyun.com/blogstar2021";
    /** Pattern passed to {@code TimeUtil.getStampToString} for the capture-time column. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Entry point: scrapes every field tab, then appends the rows to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Tell Selenium where the chromedriver binary lives.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        List<BlogStar> blogStars = collectBlogStars();

        // Column headers, in the same order the cells are written in appendSheet.
        List<String> heads = new ArrayList<>();
        heads.add("领域");
        heads.add("博主简称");
        heads.add("领域排名");
        heads.add("总评分");
        heads.add("参赛互动页");
        heads.add("博主首页");
        heads.add("录入时间");

        System.out.println("Creating excel");
        try {
            appendSheet(heads, blogStars);
        } catch (IOException e) {
            // Covers FileNotFoundException as well; the run is reported as finished either way.
            e.printStackTrace();
        }

        System.out.println("Done");
    }

    /**
     * Clicks through every tab of the {@code authorscoring-nav} list and converts each
     * {@code scoreitem} element shown afterwards into a {@code BlogStar}.
     *
     * @return the scraped entries, grouped by tab in page order
     * @throws InterruptedException if interrupted during a render wait
     */
    private static List<BlogStar> collectBlogStars() throws InterruptedException {
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(URL);

            // Give the page time to finish rendering.
            Thread.sleep(3000);

            List<BlogStar> blogStars = new ArrayList<>();
            List<WebElement> lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));

            for (int i = 0; i < lis.size(); i++) {
                WebElement element = lis.get(i);
                element.click();
                Thread.sleep(2000);

                // The tab label is the field name; it is the same for every entry of this tab.
                String field = element.getText();

                for (WebElement box : driver.findElements(By.className("scoreitem"))) {
                    BlogStar blogStar = new BlogStar();

                    // Field (tab label).
                    blogStar.field = field;
                    // Blogger display name.
                    blogStar.name = box.findElement(By.className("name")).getText();
                    List<WebElement> dts = box.findElements(By.tagName("dt"));
                    // Rank within the field.
                    blogStar.ranking = dts.get(0).getText();
                    // Total score.
                    blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];
                    // Scoring page: the third link of the entry.
                    blogStar.scorePage = box.findElements(By.tagName("a")).get(2).getAttribute("href");
                    // Blog home page: the first link of the entry.
                    blogStar.blogUrl = box.findElement(By.tagName("a")).getAttribute("href");
                    // Capture time.
                    blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), TIME_PATTERN);

                    blogStars.add(blogStar);
                }

                // Look the tabs up again before the next iteration, after this tab's click.
                lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
            }
            return blogStars;
        } finally {
            // quit() ends the whole browser session; running it in finally avoids leaking it on failure.
            driver.quit();
        }
    }

    /**
     * Loads the workbook (or starts a new one), adds a sheet named {@code SHEET_NAME} with a
     * header row and one row per entry, and writes the workbook back to the same file.
     *
     * @param heads     column headers for row 0
     * @param blogStars rows to export
     * @throws IOException if the file cannot be read or written
     */
    private static void appendSheet(List<String> heads, List<BlogStar> blogStars) throws IOException {
        File file = new File(OUTPUT_DIR + FILE_NAME + SUFFIX);

        try (XSSFWorkbook workbook = openWorkbook(file)) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);

            // Column widths are in 1/256 of a character.
            sheet.setColumnWidth(0, 16 * 256);
            sheet.setColumnWidth(1, 20 * 256);
            sheet.setColumnWidth(2, 10 * 256);
            sheet.setColumnWidth(3, 10 * 256);
            sheet.setColumnWidth(4, 20 * 256);
            sheet.setColumnWidth(5, 20 * 256);
            sheet.setColumnWidth(6, 25 * 256);

            // Header row.
            Row headerRow = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(heads.get(i));
            }

            CreationHelper createHelper = workbook.getCreationHelper();

            // Data rows start right below the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;

                row.createCell(colNum++).setCellValue(blogStar.field);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.ranking);
                row.createCell(colNum++).setCellValue(blogStar.score);

                // Scoring page, stored as a clickable URL.
                Cell scoreCell = row.createCell(colNum++);
                XSSFHyperlink link = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                link.setAddress(blogStar.scorePage);
                scoreCell.setHyperlink(link);
                scoreCell.setCellValue(blogStar.scorePage);

                // Blog home page, stored as a clickable URL.
                Cell blogCell = row.createCell(colNum++);
                XSSFHyperlink link2 = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                link2.setAddress(blogStar.blogUrl);
                blogCell.setHyperlink(link2);
                blogCell.setCellValue(blogStar.blogUrl);

                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            // The input stream is already closed here, so the file can safely be overwritten.
            try (FileOutputStream outputStream = new FileOutputStream(file)) {
                workbook.write(outputStream);
            }
        }
    }

    /**
     * Loads the existing workbook so earlier sheets are kept, or creates an empty workbook when
     * the file does not exist yet or has no content.
     *
     * @param file workbook location
     * @return the workbook to append to; the caller must close it
     * @throws IOException if the existing file cannot be read
     */
    private static XSSFWorkbook openWorkbook(File file) throws IOException {
        if (!file.exists() || file.length() == 0) {
            return new XSSFWorkbook();
        }
        try (FileInputStream inputStream = new FileInputStream(file)) {
            return new XSSFWorkbook(inputStream);
        }
    }
}

采集样例

  • 在这里插入图片描述

博主博客文章统计|样例程序

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: BlogArticleStatistics
 * Author: wangyetao
 * Date: 21-12-27 05:20:10
 * Description: 博主博客文章统计
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 *
 * <author> wangyetao
 * <time> 2021年 12月 27日 星期一 07:18:11 CST
 * <version> 版本号
 * <desc> 最近一次修改
 */
package simple.call.blogstar;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Scrapes the article list of one blogger's home page, following the "next page" button, and
 * exports one row per article to an {@code .xlsx} workbook.
 *
 * <p>The browser session is always terminated, and the workbook and output stream are always
 * closed, even when scraping or writing fails.
 *
 * @author wangyetao
 */
public class BlogArticleStatistics {

    /** Directory the workbook is written into. */
    private static final String OUTPUT_DIR = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Workbook file name without extension; it is also the last path segment of {@link #URL}. */
    private static final String FILE_NAME = "u014132947";
    private static final String SHEET_NAME = "article_" + FILE_NAME;
    private static final String SUFFIX = ".xlsx";
    /** Blogger home page. */
    private static final String URL = "https://blog.youkuaiyun.com/u014132947";

    /**
     * Entry point: scrapes the article list, then writes the result to
     * {@code OUTPUT_DIR + FILE_NAME + SUFFIX}.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for a page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Tell Selenium where the chromedriver binary lives.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        List<Article> blogArticles = collectArticles();

        // Column headers, in the same order the cells are written in writeWorkbook.
        List<String> heads = new ArrayList<>();
        heads.add("文章标题");
        heads.add("简要内容");
        heads.add("发布时间");
        heads.add("访问数");

        System.out.println("Creating excel");
        try {
            writeWorkbook(heads, blogArticles);
        } catch (IOException e) {
            // Covers FileNotFoundException as well; the run is reported as finished either way.
            e.printStackTrace();
        }

        System.out.println("Done");
    }

    /**
     * Walks the paginated article list until the number of collected articles reaches the
     * {@code data-num} attribute of {@code #container-header-blog}, a page shows no articles,
     * or no "next page" button is present.
     *
     * @return the scraped articles in page order
     * @throws InterruptedException if interrupted during a render wait
     */
    private static List<Article> collectArticles() throws InterruptedException {
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(URL);

            // DOM structure of a single article inside <div class="article-list">:
            //   <div class="article-item-box csdn-tracking-statistics" data-articleid="122148075">
            //     <h4 class="">
            //       <a href=".../article/details/122148075" target="_blank">
            //         <span class="article-type type-1 float-none">原创</span>
            //         (title text)
            //       </a>
            //     </h4>
            //     <p class="content">(summary text)</p>
            //     <div class="info-box d-flex align-content-center">
            //       <p>
            //         <span class="date">2021-12-26 06:16:59</span>
            //         <span class="read-num"><img src="..." alt="">105</span>
            //       </p>
            //     </div>
            //     <div class="opt-box">...</div>
            //   </div>

            // Give the page time to finish rendering.
            Thread.sleep(2000);

            // Number of articles to collect, as read from the data-num attribute.
            int dataNum = Integer.parseInt(driver.findElement(By.id("container-header-blog")).getAttribute("data-num"));

            List<Article> blogArticles = new ArrayList<>();
            while (blogArticles.size() < dataNum) {
                List<WebElement> items = driver.findElements(By.className("article-item-box"));
                if (items.isEmpty()) {
                    // Nothing more to read; without this guard the loop could never reach dataNum.
                    break;
                }

                for (WebElement element : items) {
                    Article article = new Article();

                    // Title: text of the first link in the item.
                    article.title = element.findElement(By.tagName("a")).getText();
                    // Summary.
                    article.content = element.findElement(By.className("content")).getText();
                    // Publication time.
                    article.publishTime = element.findElement(By.className("date")).getText();
                    // Read count.
                    article.readNum = StringUtil.getInts(element.findElement(By.className("read-num")).getText())[0];
                    blogArticles.add(article);
                }

                if (blogArticles.size() >= dataNum) {
                    break;
                }

                // findElements returns an empty list when the button is absent, whereas
                // findElement would throw NoSuchElementException (it never returns null).
                List<WebElement> nextButtons = driver.findElements(By.className("js-page-next"));
                if (nextButtons.isEmpty()) {
                    break;
                }
                nextButtons.get(0).click();

                // Give the next page time to finish rendering.
                Thread.sleep(3000);
            }
            return blogArticles;
        } finally {
            // quit() ends the whole browser session; running it in finally avoids leaking it on failure.
            driver.quit();
        }
    }

    /**
     * Builds a fresh workbook with a header row followed by one row per article and writes it to
     * disk.
     *
     * @param heads        column headers for row 0
     * @param blogArticles rows to export
     * @throws IOException if the file cannot be created or written
     */
    private static void writeWorkbook(List<String> heads, List<Article> blogArticles) throws IOException {
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);

            // Column widths are in 1/256 of a character; the read-count column is the narrow one.
            for (int i = 0; i < heads.size(); i++) {
                int chars = (i == 3) ? 6 : 15;
                sheet.setColumnWidth(i, chars * 256);
            }

            // Header row.
            Row headerRow = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(heads.get(i));
            }

            // Data rows start right below the header.
            int rowNum = 1;
            for (Article article : blogArticles) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(article.title);
                row.createCell(colNum++).setCellValue(article.content);
                row.createCell(colNum++).setCellValue(article.publishTime);
                row.createCell(colNum++).setCellValue(article.readNum);
            }

            try (FileOutputStream outputStream = new FileOutputStream(OUTPUT_DIR + FILE_NAME + SUFFIX)) {
                workbook.write(outputStream);
            }
        }
    }
}

采集样例

  • 在这里插入图片描述

2021线上评分[入围名单TOP100]数据采集|样例程序

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: Blogstar2021_02
 * Author: wangyetao
 * Date: 2022年 01月 08日 星期六 21:49:17 CST
 * Description: 线上评分[入围名单TOP100]数据采集,输出blogstar2021.xlsx
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 *
 * <author> wangyetao
 * <time> 2022年 01月 08日 星期六 22:03:26 CST
 * <version> 版本号
 * <desc> 最近一次修改
 */
package simple.call.blogstar;

import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Scrapes the 2021 "Blog Star" online-scoring shortlist (TOP100, one list per field tab) and
 * appends the result as a new, timestamp-named sheet to {@code blogstar2021.xlsx}.
 *
 * <p>Existing sheets in the workbook are preserved. If the workbook file is missing or empty a
 * new workbook is started instead of discarding the scraped data.
 *
 * @author wangyetao
 */
public class Blogstar2021_02 {

    /** Directory that holds the workbook. */
    private static final String OUTPUT_DIR = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Workbook file name without extension. */
    private static final String FILE_NAME = "blogstar2021";
    /** Each run writes to its own sheet, named after the time this class was initialised. */
    private static final String SHEET_NAME = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
    private static final String SUFFIX = ".xlsx";
    /** Page that hosts the per-field scoring tabs. */
    private static final String URL = "https://www.youkuaiyun.com/blogstar2021";
    /** Pattern passed to {@code TimeUtil.getStampToString} for the capture-time column. */
    private static final String TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Entry point: scrapes every field tab, then appends the rows to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // Tell Selenium where the chromedriver binary lives.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");

        List<BlogStar> blogStars = collectBlogStars();

        // Column headers, in the same order the cells are written in appendSheet.
        List<String> heads = new ArrayList<>();
        heads.add("领域");
        heads.add("博主简称");
        heads.add("领域排名");
        heads.add("总评分");
        heads.add("参赛互动页");
        heads.add("博主首页");
        heads.add("录入时间");

        System.out.println("Creating excel");
        try {
            appendSheet(heads, blogStars);
        } catch (IOException e) {
            // Covers FileNotFoundException as well; the run is reported as finished either way.
            e.printStackTrace();
        }

        System.out.println("Done");
    }

    /**
     * Clicks through every tab of the {@code authorscoring-nav} list and converts each
     * {@code authorscoring-cont-box} element shown afterwards into a {@code BlogStar}.
     *
     * @return the scraped entries, grouped by tab in page order
     * @throws InterruptedException if interrupted during a render wait
     */
    private static List<BlogStar> collectBlogStars() throws InterruptedException {
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(URL);

            // Give the page time to finish rendering.
            Thread.sleep(3000);

            List<BlogStar> blogStars = new ArrayList<>();
            List<WebElement> lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));

            for (int i = 0; i < lis.size(); i++) {
                WebElement element = lis.get(i);
                element.click();
                Thread.sleep(2000);

                // The tab label is the field name; it is the same for every entry of this tab.
                String field = element.getText();

                for (WebElement box : driver.findElements(By.className("authorscoring-cont-box"))) {
                    BlogStar blogStar = new BlogStar();

                    // Field (tab label).
                    blogStar.field = field;
                    // Blogger display name.
                    blogStar.name = box.findElement(By.className("name")).getText();
                    List<WebElement> dts = box.findElements(By.tagName("dt"));
                    // Rank within the field.
                    blogStar.ranking = dts.get(0).getText();
                    // Total score.
                    blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];
                    // Scoring page: the third link of the entry.
                    blogStar.scorePage = box.findElements(By.tagName("a")).get(2).getAttribute("href");
                    // Blog home page: the first link of the entry.
                    blogStar.blogUrl = box.findElement(By.tagName("a")).getAttribute("href");
                    // Capture time.
                    blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), TIME_PATTERN);

                    blogStars.add(blogStar);
                }

                // Look the tabs up again before the next iteration, after this tab's click.
                lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
            }
            return blogStars;
        } finally {
            // quit() ends the whole browser session; running it in finally avoids leaking it on failure.
            driver.quit();
        }
    }

    /**
     * Loads the workbook (or starts a new one), adds a sheet named {@code SHEET_NAME} with a
     * header row and one row per entry, and writes the workbook back to the same file.
     *
     * @param heads     column headers for row 0
     * @param blogStars rows to export
     * @throws IOException if the file cannot be read or written
     */
    private static void appendSheet(List<String> heads, List<BlogStar> blogStars) throws IOException {
        File file = new File(OUTPUT_DIR + FILE_NAME + SUFFIX);

        try (XSSFWorkbook workbook = openWorkbook(file)) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);

            // Column widths are in 1/256 of a character.
            sheet.setColumnWidth(0, 16 * 256);
            sheet.setColumnWidth(1, 20 * 256);
            sheet.setColumnWidth(2, 10 * 256);
            sheet.setColumnWidth(3, 10 * 256);
            sheet.setColumnWidth(4, 20 * 256);
            sheet.setColumnWidth(5, 20 * 256);
            sheet.setColumnWidth(6, 25 * 256);

            // Header row.
            Row headerRow = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(heads.get(i));
            }

            CreationHelper createHelper = workbook.getCreationHelper();

            // Data rows start right below the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;

                row.createCell(colNum++).setCellValue(blogStar.field);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.ranking);
                row.createCell(colNum++).setCellValue(blogStar.score);

                // Scoring page, stored as a clickable URL.
                Cell scoreCell = row.createCell(colNum++);
                XSSFHyperlink link = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                link.setAddress(blogStar.scorePage);
                scoreCell.setHyperlink(link);
                scoreCell.setCellValue(blogStar.scorePage);

                // Blog home page, stored as a clickable URL.
                Cell blogCell = row.createCell(colNum++);
                XSSFHyperlink link2 = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                link2.setAddress(blogStar.blogUrl);
                blogCell.setHyperlink(link2);
                blogCell.setCellValue(blogStar.blogUrl);

                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            // The input stream is already closed here, so the file can safely be overwritten.
            try (FileOutputStream outputStream = new FileOutputStream(file)) {
                workbook.write(outputStream);
            }
        }
    }

    /**
     * Loads the existing workbook so earlier sheets are kept, or creates an empty workbook when
     * the file does not exist yet or has no content.
     *
     * @param file workbook location
     * @return the workbook to append to; the caller must close it
     * @throws IOException if the existing file cannot be read
     */
    private static XSSFWorkbook openWorkbook(File file) throws IOException {
        if (!file.exists() || file.length() == 0) {
            return new XSSFWorkbook();
        }
        try (FileInputStream inputStream = new FileInputStream(file)) {
            return new XSSFWorkbook(inputStream);
        }
    }
}

采集样例

在这里插入图片描述

作于2021年 12月 27日 星期一 04:02:17 CST,归档于2021年 12月 27日 星期一 20:48:42 CST。

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

dnbug Blog

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值