1. Business background
We needed to load data for roughly 250 million users, about 500 million records in total, into a Redis cluster. The original Hive UDF wrote each record like this:
jedis.set(key,value);
jedis.expire(key, 7*86400);
Written this way, every record costs two synchronous round trips (SET, then EXPIRE), so the job was extremely slow: at the throughput observed on the cluster it would have taken about 40 hours to finish.
2. Solution
Searching the documentation showed that Redis pipelining can write data much faster, but JedisCluster itself does not support pipelines, so JedisCluster and Pipeline have to be combined by hand.
The core idea: compute int slot = JedisClusterCRC16.getSlot(key) to bind each key to its hash slot, use the slot to look up the Redis node that owns it, and open a pipeline against that node.
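For illustration, the idea for a single key looks roughly like this (a hypothetical sketch; handler stands for the JedisSlotAdvancedConnectionHandler built in step (1) below, and the key and value are made up):
int slot = JedisClusterCRC16.getSlot("user:12345");        // hash slot in the range 0..16383
JedisPool pool = handler.getJedisPoolFromSlot(slot);       // pool of the node that owns this slot
Jedis jedis = pool.getResource();
Pipeline pipeline = jedis.pipelined();                     // commands are buffered locally instead of one round trip each
pipeline.set("user:12345", "20240101");
pipeline.expire("user:12345", 7 * 86400);
pipeline.sync();                                           // flush everything and wait for the replies in one go
jedis.close();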
(1) Extend JedisSlotBasedConnectionHandler and JedisCluster
class JedisSlotAdvancedConnectionHandler extends JedisSlotBasedConnectionHandler {
    public JedisSlotAdvancedConnectionHandler(Set<HostAndPort> nodes, GenericObjectPoolConfig poolConfig, int connectionTimeout, int soTimeout) {
        super(nodes, poolConfig, connectionTimeout, soTimeout);
    }

    /**
     * Expose the connection pool of the node that owns the given hash slot.
     */
    public JedisPool getJedisPoolFromSlot(int slot) {
        JedisPool connectionPool = cache.getSlotPool(slot);
        if (connectionPool != null) {
            // A valid connection can't be guaranteed because of node assignment
            return connectionPool;
        } else {
            // Abnormal situation for cluster mode: we have nothing for this slot, try to rediscover the cluster state
            renewSlotCache();
            connectionPool = cache.getSlotPool(slot);
            if (connectionPool != null) {
                return connectionPool;
            } else {
                throw new JedisNoReachableClusterNodeException("No reachable node in cluster for slot " + slot);
            }
        }
    }
}
class JedisClusterPipeline extends JedisCluster {
    public JedisClusterPipeline(Set<HostAndPort> jedisClusterNode, int connectionTimeout, int soTimeout, int maxAttempts, final GenericObjectPoolConfig poolConfig) {
        super(jedisClusterNode, connectionTimeout, soTimeout, maxAttempts, poolConfig);
        super.connectionHandler = new JedisSlotAdvancedConnectionHandler(jedisClusterNode, poolConfig,
                connectionTimeout, soTimeout);
    }

    public JedisSlotAdvancedConnectionHandler getConnectionHandler() {
        return (JedisSlotAdvancedConnectionHandler) this.connectionHandler;
    }

    /**
     * Refresh the cluster slot cache; call this whenever the cluster topology may have changed.
     */
    public void refreshCluster() {
        connectionHandler.renewSlotCache();
    }
}
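One caveat of this approach: unlike JedisCluster's single-command calls, a pipeline does not follow MOVED/ASK redirections, so if slots are being migrated while a batch is in flight, some commands in that batch may be rejected with errors that the code below never reads, and those keys would silently not be written. Calling refreshCluster() before each batch shrinks that window, but during an active resharding the caller would need to check the pipeline responses (or re-verify the keys) and retry.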
(2) Build the batch write code
The batch passed in has the format Map<String, Map<String, String>>, where the outer key is the same as the "key" entry of the corresponding inner map (e.g. "user:12345" -> {"key": "user:12345", "value": "20240101"}). For other input formats, adapt the code accordingly.
class JedisPipeline {
    private static String VALUE = "value";
    private JedisClusterPipeline jedisClusterPipeline;
    private JedisSlotAdvancedConnectionHandler jedisSlotAdvancedConnectionHandler;

    // Build the JedisCluster pipeline wrapper used for batch writes
    public void clusterPipeline(Set<HostAndPort> nodes) {
        // Jedis connection pool configuration
        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
        // Maximum number of idle connections, default 8
        jedisPoolConfig.setMaxIdle(100);
        // Maximum number of connections, default 8
        jedisPoolConfig.setMaxTotal(500);
        // Minimum number of idle connections, default 0
        jedisPoolConfig.setMinIdle(0);
        // Maximum wait in milliseconds when borrowing a connection (with blockWhenExhausted); throws on timeout; negative means block indefinitely, default -1
        jedisPoolConfig.setMaxWaitMillis(2000); // 2 seconds
        // Validate connections when they are borrowed from the pool
        jedisPoolConfig.setTestOnBorrow(true);
        jedisClusterPipeline = new JedisClusterPipeline(nodes, 2000, 2000, 10, jedisPoolConfig);
        jedisSlotAdvancedConnectionHandler = jedisClusterPipeline.getConnectionHandler();
    }

    @SuppressWarnings("rawtypes")
    public void set(Map<String, Map> map, int expireTime) {
        // Look up the slot of each key, resolve the slot to a JedisPool, and group the keys by JedisPool
        Map<JedisPool, List<String>> poolKeys = new HashMap<>();
        for (String key : map.keySet()) {
            int slot = JedisClusterCRC16.getSlot(key);
            JedisPool jedisPool = jedisSlotAdvancedConnectionHandler.getJedisPoolFromSlot(slot);
            if (poolKeys.containsKey(jedisPool)) {
                poolKeys.get(jedisPool).add(key);
            } else {
                List<String> keys = new ArrayList<>();
                keys.add(key);
                poolKeys.put(jedisPool, keys);
            }
        }
        // Refresh the slot cache before writing, in case the topology has changed
        jedisClusterPipeline.refreshCluster();
        // Use a per-node Jedis pipeline to write each group in bulk
        for (JedisPool jedisPool : poolKeys.keySet()) {
            Jedis jedis = jedisPool.getResource();
            Pipeline pipeline = jedis.pipelined();
            List<String> keys = poolKeys.get(jedisPool);
            for (String key : keys) {
                String value = map.get(key).get(VALUE).toString();
                pipeline.set(key, value);
                pipeline.expire(key, expireTime);
            }
            pipeline.sync(); // flush the buffered commands and wait for the replies
            jedis.close();
        }
    }
}
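A hypothetical usage sketch of the class above (node addresses, key, and value are made up for illustration):
Set<HostAndPort> nodes = new HashSet<>();
nodes.add(new HostAndPort("10.0.0.1", 6379));
nodes.add(new HostAndPort("10.0.0.2", 6379));
JedisPipeline writer = new JedisPipeline();
writer.clusterPipeline(nodes);

Map<String, Map> batch = new HashMap<>();   // raw inner Map, matching the set(...) signature above
Map<String, String> record = new HashMap<>();
record.put("key", "user:12345");
record.put("value", "20240101");
batch.put("user:12345", record);            // outer key equals the inner "key" entry
writer.set(batch, 7 * 86400);               // write the whole batch with a 7-day TTL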
3. Complete Hive UDF
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.pool2.impl.GenericObjectPoolConfig;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.io.IntWritable;
import redis.clients.jedis.HostAndPort;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisCluster;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import redis.clients.jedis.JedisSlotBasedConnectionHandler;
import redis.clients.jedis.Pipeline;
import redis.clients.jedis.exceptions.JedisNoReachableClusterNodeException;
import redis.clients.util.JedisClusterCRC16;
public class HiveToRedisSetPipline extends GenericUDF {
    private ObjectInspector[] inputOI;
    @SuppressWarnings("rawtypes")
    private Map<String, Map> cachedMap = new HashMap<String, Map>();
    private static int EXPIRE_TIME = 604800;
    private static String KEY = "key";
    private JedisPipeline pipeline = null;
    private IntWritable result;

    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments.length != 4) {
            throw new UDFArgumentException("Expecting 4 arguments: map, hostAndPorts, expireTime, limit");
        }
        this.inputOI = arguments;
        return PrimitiveObjectInspectorFactory.writableIntObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        Map<?, ?> map = ((MapObjectInspector) this.inputOI[0]).getMap(arguments[0].get());
        String hostAndPorts = PrimitiveObjectInspectorUtils.getString(arguments[1].get(), (PrimitiveObjectInspector) this.inputOI[1]);
        // Remember the TTL in the static field so close() can flush the remaining rows with the same expiry
        int expireTime = EXPIRE_TIME = PrimitiveObjectInspectorUtils.getInt(arguments[2].get(), (PrimitiveObjectInspector) this.inputOI[2]);
        int limit = PrimitiveObjectInspectorUtils.getInt(arguments[3].get(), (PrimitiveObjectInspector) this.inputOI[3]);
        // Lazily build the cluster pipeline on the first row
        if (pipeline == null) {
            String[] split = hostAndPorts.split(",");
            Set<HostAndPort> nodes = new HashSet<HostAndPort>();
            for (String s : split) {
                String[] arr = s.split(":");
                nodes.add(new HostAndPort(arr[0], Integer.valueOf(arr[1])));
            }
            pipeline = new JedisPipeline();
            pipeline.clusterPipeline(nodes);
        }
        // Buffer the row; flush to Redis once the batch reaches the configured limit
        cachedMap.put(map.get(KEY).toString(), map);
        result = new IntWritable(0);
        if (cachedMap.size() >= limit) {
            pipeline.set(cachedMap, expireTime);
            result = new IntWritable(cachedMap.size());
            cachedMap.clear();
        }
        return result;
    }

    @Override
    public String getDisplayString(String[] children) {
        return "hive_to_redis_set_pipeline(map, hostAndPorts, expireTime, limit)";
    }

    @Override
    public void close() throws IOException {
        // Flush any rows left in the buffer when the task finishes
        if (cachedMap.size() > 0) {
            pipeline.set(cachedMap, EXPIRE_TIME);
            cachedMap.clear();
        }
        pipeline = null;
    }
}
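A typical way to apply it (jar path and function name here are hypothetical): package these classes together with the Jedis dependency into a jar, add it in Hive with ADD JAR, register the class with CREATE TEMPORARY FUNCTION ... AS 'HiveToRedisSetPipline', then select the function over the source table, passing a map('key', ..., 'value', ...) column, the comma-separated host:port list of the cluster, the TTL in seconds, and the batch size at which cached rows are flushed; summing the returned values gives the number of records written.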
4. Practical experience
- Key length: 50-70 characters
- Value: a date in yyyymmdd format
- About 40 million records written in 5-20 minutes
At that rate the full 500 million records finish in roughly 1-4 hours, compared with the 40 hours estimated for the original per-key writes.
Reference: "Java 使用Pipeline对Redis批量读写", harvey's blog on CSDN.