选型:
hadoop 2.9.0
hive 2.3.4
flume 1.8.0
1,创建hive表
分区
分桶
压缩
事务支持
-- Target table for Flume Hive-sink streaming ingest.
-- The Hive Streaming API requires exactly this shape:
-- ORC storage + CLUSTERED BY (bucketed) + transactional=true.
create table if not exists radiusc(
    UserName      string,
    PublicIP      string,
    PrivateIP     string,
    BeginPort     string,
    EndPort       string,
    OnlineState   string,
    OnlineTime    string,
    OfflineTime   string,
    MacAddress    string,
    OfflineReason string
)
comment 'adr information'
partitioned by (day string, hour string)
clustered by (PublicIP) into 4 buckets
-- NOTE: the original "row format delimited fields terminated by '|'" clause
-- was dropped: STORED AS ORC installs OrcSerde (a binary columnar format),
-- so text field/line delimiters are ignored. The '|' splitting of incoming
-- records is handled by the Flume Hive sink's DELIMITED serializer instead.
stored as ORC
tblproperties ('orc.compress' = 'SNAPPY', 'transactional' = 'true');
2,hive-site.xml
事务支持
#<property> <name>hive.support.concurrency</name> <value>true</value> </property>
#<property> <name>hive.enforce.bucketing</name> <value>true</value> </property>
# NOTE: hive.enforce.bucketing was removed in Hive 2.x (bucketing is always
# enforced there), so this property only applies when running Hive 1.x.
#<property> <name>hive.exec.dynamic.partition.mode</name> <value>nonstrict</value> </property>
#<property> <name>hive.txn.manager</name> <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value> </property>
#<property> <name>hive.compactor.initiator.on</name> <value>true</value> </property>
#<property> <name>hive.compactor.worker.threads</name> <value>1</value> </property>
mysql存储schema信息
<!-- Metastore schema database: MySQL on host slaver3; the 'hive' database is
     auto-created on first use via createDatabaseIfNotExist=true. -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://slaver3:3306/hive?createDatabaseIfNotExist=true</value>
</property>
<!-- JDBC driver class; the mysql-connector jar must be on Hive's classpath
     (typically dropped into $HIVE_HOME/lib). -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<!-- Metastore DB credentials.
     NOTE(review): plaintext root password in hive-site.xml — fine for a lab
     setup, but for anything shared consider a Hadoop credential provider
     (hadoop.security.credential.provider.path) instead. -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
3,flume.config
# Launch command. The extra --classpath entries put the Hive and HCatalog jars
# on Flume's classpath, which the Hive sink needs for the Hive Streaming API.
# flume-ng agent -n agent2hiveradius -c conf -f /root/flume.config --classpath /home/hadoop/hive-2.3.4/lib/*:/home/hadoop/hive-2.3.4/hcatalog/share/hcatalog/* -Dflume.root.logger=INFO,console
# NOTE: the target Hive table must support transactions
# (ORC + bucketed + transactional=true), or the sink will fail to open.
# Agent topology: spooldir source -> memory channel -> Hive sink
agent2hiveradius.sources = xdrss
agent2hiveradius.sinks = hivesk
agent2hiveradius.channels = memcn
# Source: spooling directory — ingests completed files dropped in /root/testdatas.
agent2hiveradius.sources.xdrss.type = spooldir
agent2hiveradius.sources.xdrss.spoolDir = /root/testdatas
#agent2hiveradius.sources.xdrss.deletePolicy=immediate
# Timestamp interceptor: stamps every event with a 'timestamp' header.
# The Hive sink resolves %Y%m%d,%H in the partition spec from that header,
# which is why useLocalTimeStamp=false below is safe here.
agent2hiveradius.sources.xdrss.interceptors = i1
agent2hiveradius.sources.xdrss.interceptors.i1.type = timestamp
# Sink: Hive Streaming into jsltdpi.radiusc; partitions (day,hour) are derived
# from the event timestamp, rounded DOWN to 10-minute boundaries.
agent2hiveradius.sinks.hivesk.type = hive
agent2hiveradius.sinks.hivesk.hive.metastore = thrift://hdmaster:9083
agent2hiveradius.sinks.hivesk.hive.database = jsltdpi
agent2hiveradius.sinks.hivesk.hive.table = radiusc
agent2hiveradius.sinks.hivesk.hive.partition =%Y%m%d,%H
agent2hiveradius.sinks.hivesk.useLocalTimeStamp = false
agent2hiveradius.sinks.hivesk.round = true
agent2hiveradius.sinks.hivesk.roundValue = 10
agent2hiveradius.sinks.hivesk.roundUnit = minute
# Per Flume Hive-sink docs: serializer.delimiter (the incoming-data field
# delimiter) is a string in double quotes; serdeSeparator must be a single
# character in single quotes, preferably the same character.
agent2hiveradius.sinks.hivesk.serializer = DELIMITED
agent2hiveradius.sinks.hivesk.serializer.delimiter = "|"
agent2hiveradius.sinks.hivesk.serializer.serdeSeparator = '|'
agent2hiveradius.sinks.hivesk.serializer.fieldnames =username,publicip,privateip,beginport,endport,onlinestate,onlinetime,offlinetime,macaddress,offlinereason
# Channel: in-memory buffer — fast, but NOTE(review): buffered events are lost
# if the agent process dies; use a file channel if delivery must survive crashes.
agent2hiveradius.channels.memcn.type = memory
agent2hiveradius.channels.memcn.capacity = 100000
agent2hiveradius.channels.memcn.transactionCapacity = 100000
agent2hiveradius.channels.memcn.byteCapacity = 2000000000
# Bind the source and sink to the channel
agent2hiveradius.sources.xdrss.channels = memcn
agent2hiveradius.sinks.hivesk.channel = memcn
4,还是有不少坑的,学习的话用Apache的可以,生产还是用Cloudera的吧