// Crawl text and images from a website (从网站中爬取文字和图片)
/**
 * Command-line crawler for travel-route search results.
 *
 * <p>Starting from a search results URL, it walks every result page by
 * following the {@code a.next} link, prints the name, introduction and price
 * of each listed route, and downloads each route's image to {@link #IMAGE_DIR}.
 */
public class TravelSpider {

    /** Directory (with trailing separator) that downloaded route images are written to. */
    private static final String IMAGE_DIR = "E:\\63\\travelImages\\";

    /**
     * Entry point: crawls the hard-coded search results URL.
     *
     * @param args unused
     * @throws Exception if a page or image cannot be fetched or written
     */
    public static void main(String[] args) throws Exception {
        String url = "http://www.jinmalvyou.com/search/index/view_type/1/keyword/%E5%9B%BD%E5%86%85";
        fetchTravelData(url);
    }

    /**
     * Fetches the page at {@code url}, prints every route on it, then keeps
     * following the "next page" link until there is none.
     *
     * @param url address of the first result page to crawl
     * @throws Exception if a page or image cannot be fetched or written
     */
    private static void fetchTravelData(String url) throws Exception {
        // Iterate instead of recursing so a long result list cannot exhaust the stack.
        String pageUrl = url;
        while (pageUrl != null) {
            Document document = Jsoup.connect(pageUrl).get();
            for (Element element : document.select(".rl-b-li")) {
                printRoute(element);
            }

            // "abs:" makes jsoup resolve the link against the page's base URI, so
            // relative and already-absolute hrefs are both handled.
            String nextUrl = document.select("a.next").attr("abs:href");

            // Stop when there is no next link, or when it does not advance
            // (which would otherwise loop on the same page forever).
            boolean hasNext = !nextUrl.isEmpty() && !nextUrl.equals(pageUrl);
            pageUrl = hasNext ? nextUrl : null;
        }
    }

    /**
     * Prints the name, introduction, price and downloaded image path of one route entry.
     *
     * @param element the {@code .rl-b-li} element describing a single route
     * @throws IOException if the route image cannot be downloaded or written
     */
    private static void printRoute(Element element) throws IOException {
        String rname = element.select(".pro-title>a").text();
        System.out.println("路线名称:" + rname);

        // Only the first ".pro-colomn" holds the introduction; first() returns
        // null (rather than throwing) when the entry has none.
        Element introduceElement = element.select(".pro-colomn").first();
        String routeIntroduce = introduceElement == null ? "" : introduceElement.text();
        System.out.println("路线介绍:" + routeIntroduce);

        String price = element.select(".price>strong").text();
        System.out.println("路线价格:" + price);

        // Resolve protocol-relative / relative image sources to a full URL.
        String rimageSrc = element.select(".pro-img img").attr("abs:src");
        // An entry without an image has nothing to download.
        String localPath = rimageSrc.isEmpty() ? "" : saveImage(rimageSrc);
        System.out.println("路线图片:" + localPath);
        System.out.println("-----------------------------------");
    }

    /**
     * Downloads the image at {@code rimageSrc} into {@link #IMAGE_DIR}, naming
     * the file after the last path segment of the URL.
     *
     * <p>The file is only written when the server answers with status 200 and a
     * response body; the target path is returned either way.
     *
     * @param rimageSrc absolute URL of the image
     * @return the local path the image is (or would have been) saved to
     * @throws IOException if the request fails or the file cannot be written
     */
    private static String saveImage(String rimageSrc) throws IOException {
        String rimageName = rimageSrc.substring(rimageSrc.lastIndexOf('/') + 1);
        // Drop any query string: '?' is not allowed in Windows file names.
        int queryStart = rimageName.indexOf('?');
        if (queryStart >= 0) {
            rimageName = rimageName.substring(0, queryStart);
        }
        String localPath = IMAGE_DIR + rimageName;

        // try-with-resources releases the client, the response and both streams
        // even when the download or the copy fails part-way.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet(rimageSrc))) {
            HttpEntity entity = response.getEntity();
            if (response.getStatusLine().getStatusCode() == 200 && entity != null) {
                try (InputStream inputStream = entity.getContent();
                     FileOutputStream outputStream = new FileOutputStream(localPath)) {
                    IOUtils.copy(inputStream, outputStream);
                }
            }
        }
        return localPath;
    }
}