《2021博客之星年度总评选》数据采集Java样例程序
文章目录
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Build descriptor for the sample scrapers: Selenium (Chrome) for page access, Apache POI for .xlsx output. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>1</groupId>
    <artifactId>_psimplemvn</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- The sources contain non-ASCII (Chinese) string literals; pin the encoding so the
             build does not depend on the platform default charset.
             NOTE(review): assumes the .java files are saved as UTF-8 - confirm. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <!-- Pin the plugin version so the build is reproducible and Maven does not warn
                     about a missing plugin version. -->
                <version>3.8.1</version>
                <configuration>
                    <!-- Compile for Java 8. -->
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <!-- Browser automation through chromedriver. -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>4.0.0</version>
        </dependency>
        <!-- XSSF (.xlsx) workbook writer. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>
    </dependencies>
</project>
2020线上投票博客之星数据采集|样例程序
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: BlogStarStatisticsTest
* Author: wangyetao
* Date: 21-12-26 23:38:10
* Description: 线上投票博客之星数据采集
* <p>
* History:
* <author> 作者姓名
* <time> 修改时间
* <version> 版本号
* <desc> 版本描述
*
* <author> wangyetao
* <time> 2021年 12月 27日 星期一 07:46:27 CST
* <version> 版本号
* <desc> 最近一次修改
*
* <author> wangyetao
* <time> 2021年 12月 27日 星期一 20:37:59 CST
* <version> 版本号
* <desc> 最近一次修改
*/
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Scrapes the candidate list of the 2020 "Blog Star" online vote with Selenium/Chrome and
 * exports one row per candidate to an .xlsx workbook.
 *
 * <p>Each list item under {@code <ul id="blogList">} is expected to contain elements with the
 * classes {@code num}, {@code name}, {@code level}, {@code blog-num} and {@code current-vote}
 * plus an {@code <img>} holding the avatar.
 *
 * @author wangyetao
 */
public class BlogStarStatisticsTest {
    /** Directory the workbook is written into. */
    private static final String OUTPUT_PATH = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the generated workbook (without extension). */
    private static final String FILE_NAME = "blog_star2020";
    /** Name of the single sheet created in the workbook. */
    private static final String SHEET_NAME = FILE_NAME.toUpperCase();
    private static final String SUFFIX = ".xlsx";
    /** Page listing all candidates of the 2020 vote. */
    private static final String URL = "https://bss.youkuaiyun.com/m/topic/blog_star2020";
    /** Header row; the order matches the cells written in {@link #writeWorkbook(List)}. */
    private static final String[] HEADS = {
        "序号", "博客简称", "小头像url", "码龄(年)", "年度原创博文数", "当前票数", "录入时间"
    };

    /**
     * Entry point: scrapes the candidate list, then writes it to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        List<BlogStar> blogStars;
        try {
            blogStars = scrapeCandidates(driver);
        } finally {
            // quit() (not close()) also ends the driver session, and the finally block
            // guarantees the browser is released even when scraping throws.
            driver.quit();
        }

        System.out.println("Creating excel");
        try {
            writeWorkbook(blogStars);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /**
     * Loads the vote page and converts every list item into a {@code BlogStar}.
     *
     * @param driver an open Chrome session
     * @return the scraped candidates in page order
     * @throws InterruptedException if interrupted while waiting for the page to render
     */
    private static List<BlogStar> scrapeCandidates(ChromeDriver driver) throws InterruptedException {
        driver.get(URL);
        List<BlogStar> blogStars = new ArrayList<>();
        // Give the page time to finish rendering the list.
        Thread.sleep(3000);
        List<WebElement> items = driver.findElements(By.xpath("//*[@id=\"blogList\"]/li"));
        for (WebElement item : items) {
            BlogStar blogStar = new BlogStar();
            // Time of capture.
            blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
            // Sequence number shown on the card.
            blogStar.num = item.findElement(By.className("num")).getText();
            // Display name of the blogger.
            blogStar.name = item.findElement(By.className("name")).getText();
            // Avatar image URL.
            blogStar.avatarUrl = item.findElement(By.tagName("img")).getAttribute("src");
            // Account age in years, e.g. taken from a text such as "码龄6年".
            blogStar.intlevel = StringUtil.getInts(item.findElement(By.className("level")).getText())[0];
            // The blog-num text starts with the year ("2020年度原创博文:77 篇"), so the count is
            // the second number found.
            blogStar.intBlogNum = StringUtil.getInts(item.findElement(By.className("blog-num")).getText())[1];
            // Current number of votes.
            blogStar.intCurrentVote = StringUtil.getInts(item.findElement(By.className("current-vote")).getText())[0];
            blogStars.add(blogStar);
        }
        return blogStars;
    }

    /**
     * Writes the header row and one row per candidate to a new workbook file.
     *
     * @param blogStars rows to export
     * @throws IOException if the file cannot be created or written
     */
    private static void writeWorkbook(List<BlogStar> blogStars) throws IOException {
        // try-with-resources closes the workbook and the stream even if writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);
            // Column widths are given in 1/256 of a character width.
            for (int i = 0; i < HEADS.length; i++) {
                if (i == 0) {
                    sheet.setColumnWidth(i, 6 * 256);
                } else if (i == 6) {
                    sheet.setColumnWidth(i, 20 * 256);
                } else {
                    sheet.setColumnWidth(i, 15 * 256);
                }
            }

            Row header = sheet.createRow(0);
            for (int i = 0; i < HEADS.length; i++) {
                header.createCell(i).setCellValue(HEADS[i]);
            }

            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(blogStar.num);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.avatarUrl);
                row.createCell(colNum++).setCellValue(blogStar.intlevel);
                row.createCell(colNum++).setCellValue(blogStar.intBlogNum);
                row.createCell(colNum++).setCellValue(blogStar.intCurrentVote);
                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            // Open the target file only once the sheet is complete, so a failure while
            // building it does not leave a truncated file behind.
            try (FileOutputStream out = new FileOutputStream(OUTPUT_PATH + FILE_NAME + SUFFIX)) {
                workbook.write(out);
            }
        }
    }
}
采集样例
2020投票贡献排行榜数据采集|样例程序
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: BlogStarStatisticsVoteLeaderboardList
* Author: wangyetao
* Date: 21-12-27 02:43:32
* Description: 投票贡献排行榜
* <p>
* History:
* <author> 作者姓名
* <time> 修改时间
* <version> 版本号
* <desc> 版本描述
*/
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Scrapes the "vote contribution leaderboard" of one 2020 Blog Star candidate with
 * Selenium/Chrome and exports one row per supporter to an .xlsx workbook.
 *
 * <p>Each list item under {@code <ul id="voteLeaderboardList">} is expected to contain elements
 * with the classes {@code num}, {@code text}, {@code code-age} and {@code vote-num}.
 *
 * @author wangyetao
 */
public class BlogStarStatisticsVoteLeaderboardList {
    /** Directory the workbook is written into. */
    private static final String OUTPUT_PATH = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Candidate user name; also used as workbook base name and sheet name. */
    private static final String FILE_NAME = "aa518189";
    private static final String SHEET_NAME = FILE_NAME;
    private static final String SUFFIX = ".xlsx";
    /** Detail page of the candidate whose supporters are listed. */
    private static final String URL = "https://bss.youkuaiyun.com/m/topic/blog_star2020/detail?username=aa518189";
    /** Header row; the order matches the cells written in {@link #writeWorkbook(List)}. */
    private static final String[] HEADS = {"编号", "博粉名称", "码龄(年)", "支持票数", "录入时间"};

    /**
     * Entry point: scrapes the leaderboard, then writes it to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        List<BlogStar> blogVotes;
        try {
            blogVotes = scrapeLeaderboard(driver);
        } finally {
            // quit() (not close()) also ends the driver session, and the finally block
            // guarantees the browser is released even when scraping throws.
            driver.quit();
        }

        System.out.println("Creating excel");
        try {
            writeWorkbook(blogVotes);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /**
     * Loads the candidate page and converts every leaderboard entry into a {@code BlogStar}.
     *
     * @param driver an open Chrome session
     * @return the scraped supporters in page order
     * @throws InterruptedException if interrupted while waiting for the page to render
     */
    private static List<BlogStar> scrapeLeaderboard(ChromeDriver driver) throws InterruptedException {
        driver.get(URL);
        List<BlogStar> blogVotes = new ArrayList<>();
        // Give the page time to finish rendering the list.
        Thread.sleep(2000);
        List<WebElement> items = driver.findElements(By.xpath("//*[@id=\"voteLeaderboardList\"]/li"));
        for (WebElement item : items) {
            BlogStar blogStar = new BlogStar();
            // Time of capture.
            blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
            // Position on the leaderboard.
            blogStar.num = item.findElement(By.className("num")).getText();
            // Supporter's display name.
            blogStar.name = item.findElement(By.className("text")).getText();
            // Account age in years, e.g. taken from a text such as "码龄4年".
            blogStar.intlevel = StringUtil.getInts(item.findElement(By.className("code-age")).getText())[0];
            // Number of votes contributed, e.g. taken from a text such as "36票".
            blogStar.intCurrentVote = StringUtil.getInts(item.findElement(By.className("vote-num")).getText())[0];
            blogVotes.add(blogStar);
        }
        return blogVotes;
    }

    /**
     * Writes the header row and one row per supporter to a new workbook file.
     *
     * @param blogVotes rows to export
     * @throws IOException if the file cannot be created or written
     */
    private static void writeWorkbook(List<BlogStar> blogVotes) throws IOException {
        // try-with-resources closes the workbook and the stream even if writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);
            // Column widths are given in 1/256 of a character width.
            for (int i = 0; i < HEADS.length; i++) {
                if (i == 0) {
                    sheet.setColumnWidth(i, 6 * 256);
                } else if (i == 4) {
                    sheet.setColumnWidth(i, 20 * 256);
                } else {
                    sheet.setColumnWidth(i, 15 * 256);
                }
            }

            Row header = sheet.createRow(0);
            for (int i = 0; i < HEADS.length; i++) {
                header.createCell(i).setCellValue(HEADS[i]);
            }

            int rowNum = 1;
            for (BlogStar blogStar : blogVotes) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(blogStar.num);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.intlevel);
                row.createCell(colNum++).setCellValue(blogStar.intCurrentVote);
                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            // Open the target file only once the sheet is complete, so a failure while
            // building it does not leave a truncated file behind.
            try (FileOutputStream out = new FileOutputStream(OUTPUT_PATH + FILE_NAME + SUFFIX)) {
                workbook.write(out);
            }
        }
    }
}
采集样例
2021线上评分TOP90数据采集|样例程序
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: Blogstar2021
* Author: wangyetao
* Date: 21-12-28 15:50:02
* Description: 线上评分TOP90数据采集,输出blogstar2021.xlsx
* <p>
* History:
* <author> 作者姓名
* <time> 修改时间
* <version> 版本号
* <desc> 版本描述
*
* <author> wangyetao
* <time> 2022年 01月 01日 星期六 06:38:36 CST
* <version> 版本号
* <desc> 最近一次修改
*/
package simple.call.blogstar;
import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Scrapes the 2021 Blog Star online-score ranking (elements with class {@code scoreitem}) for
 * every field tab with Selenium/Chrome and appends the result as a new, timestamp-named sheet
 * to {@code blogstar2021.xlsx}.
 *
 * @author wangyetao
 */
public class Blogstar2021 {
    /** Directory containing the workbook. */
    private static final String OUTPUT_PATH = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the workbook (without extension). */
    private static final String FILE_NAME = "blogstar2021";
    /** Each run adds a sheet named after its start time, so earlier runs are kept. */
    private static final String SHEET_NAME = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
    private static final String SUFFIX = ".xlsx";
    /** Blog Star 2021 landing page. */
    private static final String URL = "https://www.youkuaiyun.com/blogstar2021";
    /** Header row; the order matches the cells written in {@link #writeWorkbook(List)}. */
    private static final String[] HEADS = {
        "领域", "博主简称", "领域排名", "总评分", "参赛互动页", "博主首页", "录入时间"
    };
    /** Column widths in characters, one entry per column of {@link #HEADS}. */
    private static final int[] COLUMN_WIDTHS = {16, 20, 10, 10, 20, 20, 25};

    /**
     * Entry point: scrapes all field tabs, then appends the result to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        List<BlogStar> blogStars;
        try {
            blogStars = scrapeScores(driver);
        } finally {
            // quit() (not close()) also ends the driver session, and the finally block
            // guarantees the browser is released even when scraping throws.
            driver.quit();
        }

        System.out.println("Creating excel");
        try {
            writeWorkbook(blogStars);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /** Returns the field tabs ({@code <li>} elements inside {@code .authorscoring-nav}). */
    private static List<WebElement> findTabs(ChromeDriver driver) {
        return driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
    }

    /**
     * Clicks through every field tab and converts each score card into a {@code BlogStar}.
     *
     * @param driver an open Chrome session
     * @return all scraped entries, grouped by tab in page order
     * @throws InterruptedException if interrupted while waiting for the page to render
     */
    private static List<BlogStar> scrapeScores(ChromeDriver driver) throws InterruptedException {
        driver.get(URL);
        List<BlogStar> blogStars = new ArrayList<>();
        // Give the page time to finish rendering.
        Thread.sleep(3000);
        List<WebElement> tabs = findTabs(driver);
        for (int i = 0; i < tabs.size(); i++) {
            WebElement tab = tabs.get(i);
            tab.click();
            // Wait for the tab content to be rendered.
            Thread.sleep(2000);
            // The tab label is the same for every card of this tab; read it once.
            String field = tab.getText();
            List<WebElement> boxes = driver.findElements(By.className("scoreitem"));
            for (WebElement box : boxes) {
                BlogStar blogStar = new BlogStar();
                // Field (category) the blogger competes in.
                blogStar.field = field;
                // Display name of the blogger.
                blogStar.name = box.findElement(By.className("name")).getText();
                List<WebElement> dts = box.findElements(By.tagName("dt"));
                // First <dt>: ranking within the field.
                blogStar.ranking = dts.get(0).getText();
                // Second <dt>: total score.
                blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];
                // Third link of the card: scoring/interaction page.
                blogStar.scorePage = box.findElements(By.tagName("a")).get(2).getAttribute("href");
                // First link of the card: blogger home page.
                blogStar.blogUrl = box.findElement(By.tagName("a")).getAttribute("href");
                // Time of capture.
                blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
                blogStars.add(blogStar);
            }
            // Look the tabs up again before the next iteration, as the original code does.
            tabs = findTabs(driver);
        }
        return blogStars;
    }

    /**
     * Opens the existing workbook so earlier sheets are preserved, or starts a new one when the
     * file is missing or empty (so it no longer has to be created by hand beforehand).
     *
     * @param file workbook location
     * @return the loaded or newly created workbook
     * @throws IOException if an existing file cannot be read
     */
    private static XSSFWorkbook openOrCreateWorkbook(File file) throws IOException {
        if (file.isFile() && file.length() > 0) {
            // The stream is only needed while the workbook is being loaded; it is closed
            // before the same file is reopened for writing.
            try (FileInputStream in = new FileInputStream(file)) {
                return new XSSFWorkbook(in);
            }
        }
        return new XSSFWorkbook();
    }

    /**
     * Adds a new sheet with the header row and one row per entry, then saves the workbook.
     *
     * @param blogStars rows to export
     * @throws IOException if the workbook cannot be read or written
     */
    private static void writeWorkbook(List<BlogStar> blogStars) throws IOException {
        File file = new File(OUTPUT_PATH + FILE_NAME + SUFFIX);
        // try-with-resources closes the workbook and the stream even if writing fails.
        try (XSSFWorkbook workbook = openOrCreateWorkbook(file)) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);
            // Column widths are given in 1/256 of a character width.
            for (int i = 0; i < COLUMN_WIDTHS.length; i++) {
                sheet.setColumnWidth(i, COLUMN_WIDTHS[i] * 256);
            }

            Row header = sheet.createRow(0);
            for (int i = 0; i < HEADS.length; i++) {
                header.createCell(i).setCellValue(HEADS[i]);
            }

            CreationHelper createHelper = workbook.getCreationHelper();
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(blogStar.field);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.ranking);
                row.createCell(colNum++).setCellValue(blogStar.score);

                // Scoring page: clickable link whose text is the URL itself.
                Cell scoreCell = row.createCell(colNum++);
                XSSFHyperlink scoreLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                scoreLink.setAddress(blogStar.scorePage);
                scoreCell.setHyperlink(scoreLink);
                scoreCell.setCellValue(blogStar.scorePage);

                // Blogger home page: clickable link whose text is the URL itself.
                Cell blogCell = row.createCell(colNum++);
                XSSFHyperlink blogLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                blogLink.setAddress(blogStar.blogUrl);
                blogCell.setHyperlink(blogLink);
                blogCell.setCellValue(blogStar.blogUrl);

                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            // Open the target file only once the sheet is complete, so a failure while
            // building it does not truncate the existing workbook.
            try (FileOutputStream out = new FileOutputStream(file)) {
                workbook.write(out);
            }
        }
    }
}
采集样例
博主博客文章统计|样例程序
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: BlogArticleStatistics
* Author: wangyetao
* Date: 21-12-27 05:20:10
* Description: 博主博客文章统计
* <p>
* History:
* <author> 作者姓名
* <time> 修改时间
* <version> 版本号
* <desc> 版本描述
*
* <author> wangyetao
* <time> 2021年 12月 27日 星期一 07:18:11 CST
* <version> 版本号
* <desc> 最近一次修改
*/
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Scrapes the article list of one blogger page by page with Selenium/Chrome and exports
 * title, summary, publish time and read count of every article to an .xlsx workbook.
 *
 * <p>Each article is expected in an element with class {@code article-item-box} that carries a
 * {@code data-articleid} attribute and contains a link (title), {@code .content},
 * {@code .date} and {@code .read-num}.
 *
 * @author wangyetao
 */
public class BlogArticleStatistics {
    /** Directory the workbook is written into. */
    private static final String OUTPUT_PATH = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Blogger user name; also used as workbook base name. */
    private static final String FILE_NAME = "u014132947";
    private static final String SHEET_NAME = "article_" + FILE_NAME;
    private static final String SUFFIX = ".xlsx";
    /** Blogger home page. */
    private static final String URL = "https://blog.youkuaiyun.com/u014132947";
    /** Header row; the order matches the cells written in {@link #writeWorkbook(List)}. */
    private static final String[] HEADS = {"文章标题", "简要内容", "发布时间", "访问数"};

    /**
     * Entry point: scrapes all article pages, then writes the result to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        List<Article> blogArticles;
        try {
            blogArticles = scrapeArticles(driver);
        } finally {
            // quit() (not close()) also ends the driver session, and the finally block
            // guarantees the browser is released even when scraping throws.
            driver.quit();
        }

        System.out.println("Creating excel");
        try {
            writeWorkbook(blogArticles);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /**
     * Collects articles page by page until the count announced by the page is reached, no
     * "next page" control exists, or a page yields nothing new.
     *
     * @param driver an open Chrome session
     * @return the scraped articles in page order
     * @throws InterruptedException if interrupted while waiting for the page to render
     */
    private static List<Article> scrapeArticles(ChromeDriver driver) throws InterruptedException {
        driver.get(URL);
        List<Article> blogArticles = new ArrayList<>();
        // Give the page time to finish rendering.
        Thread.sleep(2000);
        // Total number of articles as announced by the page header.
        int dataNum = Integer.parseInt(driver.findElement(By.id("container-header-blog")).getAttribute("data-num"));
        // Article ids already collected; used to detect a page that did not advance.
        java.util.Set<String> seenIds = new java.util.HashSet<>();

        while (blogArticles.size() < dataNum) {
            int sizeBefore = blogArticles.size();
            List<WebElement> items = driver.findElements(By.className("article-item-box"));
            for (WebElement item : items) {
                String articleId = item.getAttribute("data-articleid");
                // Skip an article that was already collected; entries without an id are
                // always kept, as before.
                if (articleId != null && !articleId.isEmpty() && !seenIds.add(articleId)) {
                    continue;
                }
                Article article = new Article();
                // Title: text of the first link in the box.
                article.title = item.findElement(By.tagName("a")).getText();
                // Short summary.
                article.content = item.findElement(By.className("content")).getText();
                // Publish time.
                article.publishTime = item.findElement(By.className("date")).getText();
                // Read count.
                article.readNum = StringUtil.getInts(item.findElement(By.className("read-num")).getText())[0];
                blogArticles.add(article);
            }

            // Stop when everything was collected, or when this page added nothing new
            // (otherwise the same page would be appended again and again).
            if (blogArticles.size() >= dataNum || blogArticles.size() == sizeBefore) {
                break;
            }
            // findElements returns an empty list instead of throwing when there is no pager.
            List<WebElement> nextButtons = driver.findElements(By.className("js-page-next"));
            if (nextButtons.isEmpty()) {
                break;
            }
            nextButtons.get(0).click();
            // Wait for the next page to be rendered.
            Thread.sleep(3000);
        }
        return blogArticles;
    }

    /**
     * Writes the header row and one row per article to a new workbook file.
     *
     * @param blogArticles rows to export
     * @throws IOException if the file cannot be created or written
     */
    private static void writeWorkbook(List<Article> blogArticles) throws IOException {
        // try-with-resources closes the workbook and the stream even if writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);
            // Column widths are given in 1/256 of a character width; the read-count column is narrow.
            for (int i = 0; i < HEADS.length; i++) {
                if (i == 3) {
                    sheet.setColumnWidth(i, 6 * 256);
                } else {
                    sheet.setColumnWidth(i, 15 * 256);
                }
            }

            Row header = sheet.createRow(0);
            for (int i = 0; i < HEADS.length; i++) {
                header.createCell(i).setCellValue(HEADS[i]);
            }

            int rowNum = 1;
            for (Article article : blogArticles) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(article.title);
                row.createCell(colNum++).setCellValue(article.content);
                row.createCell(colNum++).setCellValue(article.publishTime);
                row.createCell(colNum++).setCellValue(article.readNum);
            }

            // Open the target file only once the sheet is complete, so a failure while
            // building it does not leave a truncated file behind.
            try (FileOutputStream out = new FileOutputStream(OUTPUT_PATH + FILE_NAME + SUFFIX)) {
                workbook.write(out);
            }
        }
    }
}
采集样例
2021线上评分[入围名单TOP100]数据采集|样例程序
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: Blogstar2021_02
* Author: wangyetao
* Date: 2022年 01月 08日 星期六 21:49:17 CST
* Description: 线上评分[入围名单TOP100]数据采集,输出blogstar2021.xlsx
* <p>
* History:
* <author> 作者姓名
* <time> 修改时间
* <version> 版本号
* <desc> 版本描述
*
* <author> wangyetao
* <time> 2022年 01月 08日 星期六 22:03:26 CST
* <version> 版本号
* <desc> 最近一次修改
*/
package simple.call.blogstar;
import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Scrapes the 2021 Blog Star shortlist (elements with class {@code authorscoring-cont-box})
 * for every field tab with Selenium/Chrome and appends the result as a new, timestamp-named
 * sheet to {@code blogstar2021.xlsx}.
 *
 * @author wangyetao
 */
public class Blogstar2021_02 {
    /** Directory containing the workbook. */
    private static final String OUTPUT_PATH = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the workbook (without extension). */
    private static final String FILE_NAME = "blogstar2021";
    /** Each run adds a sheet named after its start time, so earlier runs are kept. */
    private static final String SHEET_NAME = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
    private static final String SUFFIX = ".xlsx";
    /** Blog Star 2021 landing page. */
    private static final String URL = "https://www.youkuaiyun.com/blogstar2021";
    /** Header row; the order matches the cells written in {@link #writeWorkbook(List)}. */
    private static final String[] HEADS = {
        "领域", "博主简称", "领域排名", "总评分", "参赛互动页", "博主首页", "录入时间"
    };
    /** Column widths in characters, one entry per column of {@link #HEADS}. */
    private static final int[] COLUMN_WIDTHS = {16, 20, 10, 10, 20, 20, 25};

    /**
     * Entry point: scrapes all field tabs, then appends the result to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        List<BlogStar> blogStars;
        try {
            blogStars = scrapeShortlist(driver);
        } finally {
            // quit() (not close()) also ends the driver session, and the finally block
            // guarantees the browser is released even when scraping throws.
            driver.quit();
        }

        System.out.println("Creating excel");
        try {
            writeWorkbook(blogStars);
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both cases.
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /** Returns the field tabs ({@code <li>} elements inside {@code .authorscoring-nav}). */
    private static List<WebElement> findTabs(ChromeDriver driver) {
        return driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
    }

    /**
     * Clicks through every field tab and converts each shortlist card into a {@code BlogStar}.
     *
     * @param driver an open Chrome session
     * @return all scraped entries, grouped by tab in page order
     * @throws InterruptedException if interrupted while waiting for the page to render
     */
    private static List<BlogStar> scrapeShortlist(ChromeDriver driver) throws InterruptedException {
        driver.get(URL);
        List<BlogStar> blogStars = new ArrayList<>();
        // Give the page time to finish rendering.
        Thread.sleep(3000);
        List<WebElement> tabs = findTabs(driver);
        for (int i = 0; i < tabs.size(); i++) {
            WebElement tab = tabs.get(i);
            tab.click();
            // Wait for the tab content to be rendered.
            Thread.sleep(2000);
            // The tab label is the same for every card of this tab; read it once.
            String field = tab.getText();
            List<WebElement> boxes = driver.findElements(By.className("authorscoring-cont-box"));
            for (WebElement box : boxes) {
                BlogStar blogStar = new BlogStar();
                // Field (category) the blogger competes in.
                blogStar.field = field;
                // Display name of the blogger.
                blogStar.name = box.findElement(By.className("name")).getText();
                List<WebElement> dts = box.findElements(By.tagName("dt"));
                // First <dt>: ranking within the field.
                blogStar.ranking = dts.get(0).getText();
                // Second <dt>: total score.
                blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];
                // Third link of the card: scoring/interaction page.
                blogStar.scorePage = box.findElements(By.tagName("a")).get(2).getAttribute("href");
                // First link of the card: blogger home page.
                blogStar.blogUrl = box.findElement(By.tagName("a")).getAttribute("href");
                // Time of capture.
                blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
                blogStars.add(blogStar);
            }
            // Look the tabs up again before the next iteration, as the original code does.
            tabs = findTabs(driver);
        }
        return blogStars;
    }

    /**
     * Opens the existing workbook so earlier sheets are preserved, or starts a new one when the
     * file is missing or empty (so it no longer has to be created by hand beforehand).
     *
     * @param file workbook location
     * @return the loaded or newly created workbook
     * @throws IOException if an existing file cannot be read
     */
    private static XSSFWorkbook openOrCreateWorkbook(File file) throws IOException {
        if (file.isFile() && file.length() > 0) {
            // The stream is only needed while the workbook is being loaded; it is closed
            // before the same file is reopened for writing.
            try (FileInputStream in = new FileInputStream(file)) {
                return new XSSFWorkbook(in);
            }
        }
        return new XSSFWorkbook();
    }

    /**
     * Adds a new sheet with the header row and one row per entry, then saves the workbook.
     *
     * @param blogStars rows to export
     * @throws IOException if the workbook cannot be read or written
     */
    private static void writeWorkbook(List<BlogStar> blogStars) throws IOException {
        File file = new File(OUTPUT_PATH + FILE_NAME + SUFFIX);
        // try-with-resources closes the workbook and the stream even if writing fails.
        try (XSSFWorkbook workbook = openOrCreateWorkbook(file)) {
            XSSFSheet sheet = workbook.createSheet(SHEET_NAME);
            // Column widths are given in 1/256 of a character width.
            for (int i = 0; i < COLUMN_WIDTHS.length; i++) {
                sheet.setColumnWidth(i, COLUMN_WIDTHS[i] * 256);
            }

            Row header = sheet.createRow(0);
            for (int i = 0; i < HEADS.length; i++) {
                header.createCell(i).setCellValue(HEADS[i]);
            }

            CreationHelper createHelper = workbook.getCreationHelper();
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int colNum = 0;
                row.createCell(colNum++).setCellValue(blogStar.field);
                row.createCell(colNum++).setCellValue(blogStar.name);
                row.createCell(colNum++).setCellValue(blogStar.ranking);
                row.createCell(colNum++).setCellValue(blogStar.score);

                // Scoring page: clickable link whose text is the URL itself.
                Cell scoreCell = row.createCell(colNum++);
                XSSFHyperlink scoreLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                scoreLink.setAddress(blogStar.scorePage);
                scoreCell.setHyperlink(scoreLink);
                scoreCell.setCellValue(blogStar.scorePage);

                // Blogger home page: clickable link whose text is the URL itself.
                Cell blogCell = row.createCell(colNum++);
                XSSFHyperlink blogLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                blogLink.setAddress(blogStar.blogUrl);
                blogCell.setHyperlink(blogLink);
                blogCell.setCellValue(blogStar.blogUrl);

                row.createCell(colNum++).setCellValue(blogStar.createTime);
            }

            // Open the target file only once the sheet is complete, so a failure while
            // building it does not truncate the existing workbook.
            try (FileOutputStream out = new FileOutputStream(file)) {
                workbook.write(out);
            }
        }
    }
}
采集样例
作于2021年 12月 27日 星期一 04:02:17 CST,归档于2021年 12月 27日 星期一 20:48:42 CST。