使用 IDEA 整合 Elasticsearch(ES)实现京东(JD)商品爬虫

pom

 <dependencies>
        <!-- Spring Data Elasticsearch starter; provides the Elasticsearch client classes -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
        </dependency>
        <!-- JSON serialization; 1.2.83 carries the fix for the autoType
             deserialization vulnerability present in earlier 1.2.x releases -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.83</version>
        </dependency>
        <!-- Crawler: HTML fetching/parsing; 1.15.3 includes the security fixes
             released after 1.11.3 -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.15.3</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

test(测试类 QyWwcApplicationTests)

package qywwc;

import com.alibaba.fastjson.JSON;
import net.minidev.json.JSONArray;
import org.apache.lucene.search.join.QueryBitSetProducer;
import org.apache.lucene.util.QueryBuilder;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.get.GetRequest;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.support.master.AcknowledgedResponse;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.action.update.UpdateResponse;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.client.indices.CreateIndexResponse;
import org.elasticsearch.client.indices.GetIndexRequest;

import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import qywwc.entity.User;

import javax.naming.directory.SearchControls;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;


/**
 * Spring Boot tests that exercise index and document operations through the injected
 * {@link RestHighLevelClient}. Every test talks to the index named by {@link #INDEX_NAME}
 * and prints the outcome to standard output; none of them asserts on the result.
 */
@SpringBootTest
class QyWwcApplicationTests {
    /** Name of the index used by every test in this class. */
    private static final String INDEX_NAME = "qy151-index";

    @Autowired
    private RestHighLevelClient client;

    /**
     * Searches the index with a term query on the {@code name} field, using paging and
     * highlighting, and prints the total hit count followed by each hit's source and
     * highlight fields.
     *
     * @throws Exception if the search request fails
     */
    @Test
    public void TextSearch()throws Exception{
        SearchRequest searchRequest = new SearchRequest(INDEX_NAME);

        // Query condition: a term query on "name" (not a fuzzy query).
        SearchSourceBuilder builder = new SearchSourceBuilder();
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("name", "你");
        builder.query(termQueryBuilder);

        // Paging: first page, one hit per page.
        builder.from(0);
        builder.size(1);

        // Sorting is intentionally left disabled; enable with builder.sort("age").

        // Highlighting: wrap matches in the "name" field with a red <font> tag.
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field("name");
        highlightBuilder.preTags("<font color='red'>");
        highlightBuilder.postTags("</font>");
        builder.highlighter(highlightBuilder);

        searchRequest.source(builder);
        SearchResponse search = client.search(searchRequest, RequestOptions.DEFAULT);
        System.out.println("总条数:"+search.getHits().getTotalHits().value);

        for (SearchHit hit : search.getHits().getHits()) {
            System.out.println(hit.getSourceAsString());
            // The highlighted fragments are not part of the source string; they are
            // exposed separately, so print them too or the highlighter above is useless.
            System.out.println(hit.getHighlightFields());
        }
    }

    /**
     * Indexes several users in a single bulk request and prints whether any item failed,
     * followed by the failure message when there was a failure.
     *
     * @throws Exception if the bulk request fails
     */
    @Test
    void testBuck()throws Exception{
        // The index passed here is the default for items that do not name their own.
        BulkRequest bulkRequest = new BulkRequest(INDEX_NAME);

        List<User> users = new ArrayList<>();
        users.add(new User("2","你","上海1",151));
        users.add(new User("3","你1","上海2",152));
        users.add(new User("4","你2","上海3",153));
        users.add(new User("5","你3","上海4",154));
        users.add(new User("6","你4","上海5",5));

        for (User user : users) {
            IndexRequest indexRequest = new IndexRequest();
            indexRequest.id(user.getId());
            indexRequest.source(JSON.toJSONString(user), XContentType.JSON);
            bulkRequest.add(indexRequest);
        }

        BulkResponse bulk = client.bulk(bulkRequest, RequestOptions.DEFAULT);
        System.out.println(bulk.hasFailures());
        if (bulk.hasFailures()) {
            // A bare "true" gives no hint about which item failed or why.
            System.out.println(bulk.buildFailureMessage());
        }
    }

    /**
     * Updates document {@code "1"} with the JSON form of a {@code User} that only has its
     * name set, and prints the update result.
     *
     * @throws Exception if the update request fails
     */
    @Test
    void textUpdate()throws Exception{
        UpdateRequest updateRequest = new UpdateRequest(INDEX_NAME, "1");
        User user = new User();
        user.setName("王文超");
        // NOTE(review): which fields end up in the partial document depends on how User
        // serializes its unset fields; confirm against the User class.
        updateRequest.doc(JSON.toJSONString(user), XContentType.JSON);
        UpdateResponse update = client.update(updateRequest, RequestOptions.DEFAULT);
        System.out.println(update.getResult());
    }

    /**
     * Deletes document {@code "1"} and prints the delete result.
     *
     * @throws Exception if the delete request fails
     */
    @Test
    void deleteDoc()throws Exception{
        DeleteRequest deleteRequest = new DeleteRequest(INDEX_NAME);
        deleteRequest.id("1");
        DeleteResponse delete = client.delete(deleteRequest, RequestOptions.DEFAULT);
        System.out.println(delete.getResult());
    }

    /**
     * Checks whether document {@code "1"} exists in the index and prints the answer.
     *
     * @throws Exception if the exists request fails
     */
    @Test
    void textExit()throws Exception{
        GetRequest getRequest = new GetRequest(INDEX_NAME);
        getRequest.id("1");
        boolean exists = client.exists(getRequest, RequestOptions.DEFAULT);
        System.out.println(exists);
    }

    /**
     * Fetches document {@code "1"} and prints its {@code name} field, or a notice when the
     * document does not exist.
     *
     * @throws Exception if the get request fails
     */
    @Test
    void GetIndex() throws Exception{
        GetRequest getRequest = new GetRequest(INDEX_NAME);
        getRequest.id("1");
        GetResponse documentFields = client.get(getRequest, RequestOptions.DEFAULT);

        // A missing document has no source map; guard to avoid a NullPointerException.
        if (!documentFields.isExists()) {
            System.out.println("document not found: id=1");
            return;
        }
        System.out.println(documentFields.getSourceAsMap().get("name"));
    }

    /**
     * Indexes a single user as document {@code "1"} and prints the index result.
     *
     * @throws Exception if the index request fails
     */
    @Test
    void add() throws Exception{
        IndexRequest indexRequest = new IndexRequest(INDEX_NAME);
        // Explicit document id.
        indexRequest.id("1");
        // The user is sent as a JSON document.
        indexRequest.source(JSON.toJSONString(new User(null,"张三","北京",15)),XContentType.JSON);
        IndexResponse index = client.index(indexRequest, RequestOptions.DEFAULT);
        System.out.println(index.getResult());
    }

    /**
     * Checks whether the index exists and prints the answer.
     *
     * @throws Exception if the request fails
     */
    @Test
    void textExits() throws Exception{
        GetIndexRequest getIndexRequest = new GetIndexRequest(INDEX_NAME);
        boolean exists = client.indices().exists(getIndexRequest, RequestOptions.DEFAULT);
        System.out.println(exists);
    }

    /**
     * Deletes the index and prints whether the request was acknowledged.
     *
     * @throws Exception if the request fails
     */
    @Test
    void textDeleteIndex() throws Exception{
        DeleteIndexRequest deleteIndexRequest = new DeleteIndexRequest(INDEX_NAME);
        AcknowledgedResponse delete = client.indices().delete(deleteIndexRequest, RequestOptions.DEFAULT);
        System.out.println(delete.isAcknowledged());
    }

    /**
     * Creates the index and prints whether the request was acknowledged.
     *
     * @throws Exception if the request fails
     */
    @Test
    void contextLoads() throws Exception{
        // CreateIndexRequest carries the definition of the index to create.
        CreateIndexRequest indexRequest = new CreateIndexRequest(INDEX_NAME);
        CreateIndexResponse createIndexResponse = client.indices().create(indexRequest, RequestOptions.DEFAULT);
        System.out.println(createIndexResponse.isAcknowledged());
    }
}
HtmlParseUtils 爬虫工具类
/**
 * Helper for scraping product entries from the JD search results page.
 */
public class HtmlParseUtils {
    /**
     * Downloads the JD search page for {@code keyword} and extracts one {@code Prodect}
     * per {@code <li>} element found under the element with id {@code J_goodsList}.
     *
     * <p>For each entry the text of the first {@code p-name} element, the text of the first
     * {@code p-price} element and the {@code data-lazy-img} attribute of the first
     * {@code <img>} are read.
     *
     * @param keyword search term; it is URL-encoded before being placed in the query string
     * @return the extracted products, or an empty list when the page contains no
     *         {@code J_goodsList} element
     * @throws NullPointerException if {@code keyword} is {@code null}
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static List<Prodect> parseJd(String keyword) throws Exception{
        // Fully-qualified names are used so this block needs no additional imports.
        java.util.Objects.requireNonNull(keyword, "keyword");

        // Encode the keyword: raw spaces, '&', '#' or non-ASCII text would otherwise
        // produce a malformed or misinterpreted query string.
        String path = "https://search.jd.com/Search?keyword="
                + java.net.URLEncoder.encode(keyword, "UTF-8");

        // Document represents the whole fetched page; 30000 is the fetch timeout in ms.
        Document document = Jsoup.parse(new URL(path), 30000);

        List<Prodect> list = new ArrayList<>();
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            // The container is absent (e.g. a different page was served); nothing to parse.
            return list;
        }

        for (Element element : goodsList.getElementsByTag("li")) {
            String pprice = element.getElementsByClass("p-price").eq(0).text();
            String pname = element.getElementsByClass("p-name").eq(0).text();
            String pimg = element.getElementsByTag("img").eq(0).attr("data-lazy-img");
            list.add(new Prodect(pname, pprice, pimg));
        }
        return list;
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值