目录
1、下载tmdb数据下载地址:TMDB 5000 Movie Dataset | Kaggle
2、下载两个文档(tmdb_5000_credits.csv和tmdb_5000_movies.csv)
1、下载tmdb数据
下载地址:TMDB 5000 Movie Dataset | Kaggle
注意:注册账号需要翻墙才可以
2、下载两个文档(tmdb_5000_credits.csv和tmdb_5000_movies.csv)
整合文档:使用Excel将两个文件合并,把需要的字段整合到同一个CSV文件(如movie.csv)中。注意:下文导入代码读取的路径是 ./tmdb_5000_movies.csv,并从第21列(下标20)读取cast字段,请保证文件名和列顺序与代码一致。
3、项目POM文件
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the "dianping" Spring Boot application that imports CSV data into Elasticsearch. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.6.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.imooc</groupId>
    <artifactId>dianping</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>dianping</name>
    <description>dianping spring boot java project</description>
    <properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <!-- The compiler plugin reads "maven.compiler.target"; the previous "maven.compile.target" was ignored. -->
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-aop</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <!-- Elasticsearch transport client; all three artifacts must stay on the same version. -->
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>7.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>transport</artifactId>
            <version>7.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch.plugin</groupId>
            <artifactId>transport-netty4-client</artifactId>
            <version>7.3.0</version>
        </dependency>
        <!-- NOTE(review): fastjson 1.2.58 is an old release; check published security advisories and
             consider moving to a patched 1.2.x version before parsing untrusted input. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.58</version>
        </dependency>
        <!-- CSV parsing for the data import. -->
        <dependency>
            <groupId>com.opencsv</groupId>
            <artifactId>opencsv</artifactId>
            <version>4.2</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
4、导入ES代码
package com.imooc.dianping.service.impl;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.imooc.dianping.service.SellerService;
import com.opencsv.CSVReader;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.List;
/**
 * Imports movie rows from a CSV file into the Elasticsearch {@code movie} index
 * using one asynchronous bulk request.
 */
@Service
public class SellerServiceImpl implements SellerService {

    /** CSV file to import, resolved against the process working directory. */
    private static final String CSV_PATH = "./tmdb_5000_movies.csv";

    // Zero-based CSV column positions, named after the index field each one feeds.
    private static final int COL_OVERVIEW = 7;
    private static final int COL_POPULARITY = 8;
    private static final int COL_RELEASE_DATE = 11;
    private static final int COL_TAGLINE = 16;
    private static final int COL_TITLE = 17;
    private static final int COL_CAST = 20;

    /**
     * Fallback for rows without a release date. Uses dashes so that it matches the
     * {@code yyyy-MM-dd} style formats declared for {@code release_date} in the index mapping.
     */
    private static final String DEFAULT_RELEASE_DATE = "1970-01-01";

    @Autowired
    TransportClient transportClient;

    /**
     * Reads every row of the CSV file, skips the header and rows without usable cast data,
     * and submits the remaining rows to Elasticsearch as one bulk request.
     *
     * <p>The bulk call is asynchronous: this method returns before the import has finished,
     * and the outcome is reported from the listener callbacks. I/O errors while reading the
     * file are printed and abort the import without sending anything.
     */
    @Override
    public void importData() {
        BulkRequest bulkRequest = new BulkRequest();
        int lineIndex = 0;
        // try-with-resources closes both readers even when reading or parsing a row fails.
        try (InputStreamReader reader =
                     new InputStreamReader(new FileInputStream(CSV_PATH), Charset.forName("UTF-8"));
             CSVReader csvReader = new CSVReader(reader)) {
            List<String[]> rows = csvReader.readAll();
            for (String[] records : rows) {
                lineIndex++;
                if (lineIndex == 1) {
                    // The first line is the CSV header.
                    continue;
                }
                System.out.println("第"+lineIndex+"行");
                JSONObject cast = firstCastMember(records);
                if (cast == null) {
                    continue;
                }
                String date = records[COL_RELEASE_DATE];
                if (StringUtils.isEmpty(date)) {
                    date = DEFAULT_RELEASE_DATE;
                }
                // The 1-based line number doubles as the document id.
                bulkRequest.add(new IndexRequest("movie", "_doc", String.valueOf(lineIndex)).source(XContentType.JSON,
                        "title", records[COL_TITLE]
                        , "tagline", records[COL_TAGLINE]
                        , "release_date", date
                        , "popularity", records[COL_POPULARITY]
                        , "cast", cast
                        , "overview", records[COL_OVERVIEW]));
            }
        } catch (IOException e) {
            // Covers FileNotFoundException as well; nothing is sent when the file cannot be read.
            System.out.println("第---->"+lineIndex+"行");
            e.printStackTrace();
            return;
        }

        if (bulkRequest.numberOfActions() == 0) {
            // An empty bulk request is rejected by request validation, so do not send it.
            System.out.println("No rows to import from " + CSV_PATH);
            return;
        }

        transportClient.bulk(bulkRequest, new ActionListener<BulkResponse>() {
            @Override
            public void onResponse(BulkResponse bulkItemResponses) {
                System.out.println(bulkItemResponses);
                if (bulkItemResponses.hasFailures()) {
                    // Individual documents can fail even when the bulk call itself succeeds.
                    System.err.println(bulkItemResponses.buildFailureMessage());
                }
            }

            @Override
            public void onFailure(Exception e) {
                System.err.println("Bulk import into index 'movie' failed");
                e.printStackTrace();
            }
        });
    }

    /**
     * Extracts the first entry of the JSON cast array stored in the cast column.
     *
     * @param records one parsed CSV row
     * @return an object holding the {@code character} and {@code name} of the first cast entry,
     *         or {@code null} when the row is too short, the cast cell is empty, contains
     *         {@code []}, or parses to an empty array
     */
    private static JSONObject firstCastMember(String[] records) {
        if (records.length <= COL_CAST) {
            // Short or malformed row: there is no cast column to read.
            return null;
        }
        String castJson = records[COL_CAST];
        if (StringUtils.isEmpty(castJson) || castJson.contains("[]")) {
            return null;
        }
        JSONArray castArray = JSONArray.parseArray(castJson);
        if (castArray == null || castArray.isEmpty()) {
            return null;
        }
        JSONObject first = castArray.getJSONObject(0);
        JSONObject cast = new JSONObject();
        cast.put("character", first.getString("character"));
        cast.put("name", first.getString("name"));
        return cast;
    }
}
5、新建ES索引
PUT /movie
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "english"
      },
      "tagline": {
        "type": "text",
        "analyzer": "english"
      },
      "release_date": {
        "type": "date",
        "format": "yyyy-MM-dd||yyyy-M-dd||yyyy-MM-d||yyyy-M-d"
      },
      "popularity": {
        "type": "double"
      },
      "cast": {
        "type": "object",
        "properties": {
          "character": {
            "type": "text",
            "analyzer": "standard"
          },
          "name": {
            "type": "text",
            "analyzer": "standard"
          }
        }
      },
      "overview": {
        "type": "text",
        "analyzer": "english"
      }
    }
  }
}
为了方便导入,源码和文档进行了整合,代码如下