1. Start the HDFS cluster: start-all.sh
2. Start Zookeeper via the script: sh zkmanager.sh start
3. Start Nginx: cd /usr/local/nginx/sbin/ and run ./nginx
The Nginx configuration lives in /usr/local/nginx/conf. The access-log format used for collection is:
log_format main '$remote_addr,$remote_user,$time_local';
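With this format each request is written as one comma-separated line, for example (illustrative values): 192.168.33.1,-,07/Aug/2019:20:13:05 +0800. These three fields (client address, remote user, local time) are what the Hive table created at the end of these notes expects.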
Note: to modify the Nginx configuration file: cd /usr/local/nginx/conf/ then
vi nginx.conf
upstream frame-tomcat {
    server hdp-4:9090;
}
server {
    listen       80;
    server_name  hdp-1;

    #charset koi8-r;

    access_log  logs/log.frame.access.log  main;

    location / {
        # root   html;
        # index  index.html index.htm;
        proxy_pass http://frame-tomcat;
    }

    error_page  500 502 503 504  /50x.html;
    location = /50x.html {
        root html;
    }
}
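After editing nginx.conf, reload the configuration with ./nginx -s reload (or stop and start Nginx again) so that the access_log and proxy_pass settings take effect.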
4. Start Flume (the environment variables have already been configured)
Note: edit the Flume configuration file:
cd apps/flume-1.6.0/conf/ then vi tail-kafka.conf
a1.sources = source1
a1.sinks = k1
a1.channels = c1
a1.sources.source1.type = exec
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = first
a1.sinks.k1.brokerList = hdp-1:9092,hdp-2:9092,hdp-3:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1
$FLUME_HOME/bin/flume-ng agent \
-c conf \
-n a1 \
-f $FLUME_HOME/conf/tail-kafka.conf \
-Dflume.root.logger=DEBUG,console
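Note that the agent name passed with -n (a1) must match the property prefix used in tail-kafka.conf (a1.sources, a1.sinks, a1.channels).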
5. Start Kafka (it must be started on every machine that runs Zookeeper)
cd apps/kafka_2.12-2.2.0/bin/
Start the Kafka broker:
./kafka-server-start.sh ../config/server.properties &
Create a topic:
./kafka-topics.sh --zookeeper hdp-1:2181 --create --replication-factor 3 --partitions 1 --topic first
Start a console producer:
./kafka-console-producer.sh --broker-list hdp-1:9092 --topic first
Start a console consumer:
./kafka-console-consumer.sh --bootstrap-server hdp-1:9092 --from-beginning --topic first
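Once the Flume agent from step 4 is running, requests that go through Nginx should show up in this console consumer, because the Flume Kafka sink publishes to the same topic first.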
6. Write a consumer class, ConsumerDemo, in IDEA
package com.stu.consumer;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.util.Collections;
import java.util.Properties;

public class ConsumerDemo {

    private static KafkaConsumer<String, String> consumer;
    private static Properties props;

    static {
        props = new Properties();
        // Kafka broker address for the consumer
        props.put("bootstrap.servers", "hdp-1:9092");
        // key/value deserializers
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // consumer group
        props.put("group.id", "yangk");
    }

    /**
     * Fetch data from Kafka (Spring Boot also ships a Kafka integration).
     */
    private static void consumerMessage() {
        // allow automatic offset commits
        props.put("enable.auto.commit", true);
        consumer = new KafkaConsumer<String, String>(props);
        consumer.subscribe(Collections.singleton("first"));
        // Pull data by polling. Messages are removed by the broker only after the configured
        // retention period; a message that has already been consumed can be read again via its offset.
        try {
            while (true) {
                // the data read from Kafka in this poll is stored in records
                ConsumerRecords<String, String> records = consumer.poll(100);
                for (ConsumerRecord<String, String> r : records) {
                    System.out.printf("topic = %s, offset = %s, key = %s, value = %s%n",
                            r.topic(), r.offset(), r.key(), r.value());
                }
            }
        } finally {
            consumer.close();
        }
    }

    public static void main(String[] args) {
        consumerMessage();
    }
}
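To run this class, only the kafka-clients dependency is needed on the classpath (a version matching the brokers, 2.2.0 here, is the safe assumption), and hdp-1:9092 must be reachable from the development machine.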
7. Upload the log data to HDFS with Java code
First write the data obtained from Kafka to a local file, then upload that local file to HDFS (a roundabout process, kept here only as a learning aid); a sketch that combines the Kafka consumer with the file writer follows the ConsumerDemo class below.
Implemented as a Maven project:
package com.stu.kafkatohtml;

import java.io.*;

public class ConsumerDemo {

    /**
     * Project requirement: first write the data collected from Kafka to the local machine,
     * then upload the local file to HDFS.
     * Note: once the local file reaches a certain size, upload it to HDFS and create a new
     * file to keep storing data.
     * 1. Create a local directory.
     * 2. Write the data into a file.
     * 3. After a certain amount of time, upload the file to HDFS (driven by a timer).
     *
     * @param args
     */
    public static void main(String[] args) throws Exception {
        makeDirWindows();
    }

    private static void makeDirWindows() throws IOException {
        /**
         * Create a local directory, create a file inside it, and write content into that file.
         */
        // 1. Directory path and file name
        String directory = "E:/test";
        // SimpleDateFormat sdf = new SimpleDateFormat("yyyy_MM_dd_HH_MM_SS");
        // 2. Create the directory object and the file object
        File file = new File(directory);
        // if the directory does not exist, create it
        if (!file.exists()) {
            file.mkdirs();
        }
        // 1048576
        int i = 1;
        String format = "/kafka-" + i + ".txt";
        File file2 = new File(directory, format);
        long length = file2.length();
        if (length < 10) {
            // if the file does not exist, create an empty file
            if (!file2.exists()) {
                file2.createNewFile();
            }
        }
        System.out.println(file2.getName() + " file size: " + length);
        // 3. Append data to the file
        BufferedWriter bfw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file2, true)));
        String srt = "Hello";
        bfw.write(srt + "\r\n");
        System.out.println("Write to file succeeded");
        bfw.close();
    }
}
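The consumer from step 6 and the file writer above can be combined so that, instead of the literal "Hello", each Kafka record value is appended to the local kafka-N.txt file that the timer below later uploads. A minimal sketch, assuming the same broker, topic, and group settings as ConsumerDemo (the class name KafkaToLocalFileSketch and the fixed file name kafka-1.txt are illustrative, not part of the original project):

package com.stu.kafkatohtml;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class KafkaToLocalFileSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "hdp-1:9092");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("group.id", "yangk");
        props.put("enable.auto.commit", "true");

        // append every record value to the local file that the timer task uploads to HDFS
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                     new FileOutputStream("E:/test/kafka-1.txt", true)))) {
            consumer.subscribe(Collections.singleton("first"));
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(100));
                for (ConsumerRecord<String, String> r : records) {
                    writer.write(r.value());
                    writer.write("\r\n");
                }
                // flush after each poll so the scheduled upload always sees complete lines
                writer.flush();
            }
        }
    }
}

Rolling over to kafka-2.txt, kafka-3.txt, and so on once the current file passes the size threshold would reuse the length check shown in ConsumerDemo above.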
A timer drives the upload (first run after 1 second, then every 30 minutes):
package com.stu.time;

import java.util.Timer;

public class TimerTest {
    public static void main(String[] args) {
        Timer timer = new Timer();
        Timer1 timer1 = new Timer1();
        timer.schedule(timer1, 1000, 30 * 60 * 1000);
    }
}

package com.stu.time;

import com.stu.kafkatohtml.HtmlDemo;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimerTask;

public class Timer1 extends TimerTask {
    @Override
    public void run() {
        SimpleDateFormat sdm = new SimpleDateFormat("yyyy年MM月dd日 HH点:mm分:ss:秒");
        String format = sdm.format(new Date());
        new HtmlDemo().dirtohdfs();
    }
}
package com.stu.kafkatohtml;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.net.URI;

public class HtmlDemo {

    public static void dirtohdfs() {
        URI uri = null;
        FileSystem fs = null;
        try {
            uri = new URI("hdfs://hdp-1:9000");
            Configuration conf = new Configuration();
            conf.set("dfs.replication", "2");
            conf.set("dfs.blocksize", "64m");
            String user = "root";
            fs = FileSystem.get(uri, conf, user);
            // upload the local file to HDFS
            Path src = new Path("E:/test/test.txt");
            Path dst = new Path("/test");
            fs.copyFromLocalFile(src, dst);
            System.out.println("HDFS file upload finished!");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (fs != null) {
                    fs.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
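Note the mismatch between the two classes: the writer above produces E:/test/kafka-1.txt, while dirtohdfs() uploads E:/test/test.txt into the HDFS directory /test; in the full pipeline the upload path would point at the kafka-N.txt file that is actually being written.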
Using Flume configuration to upload the log files produced by the web project to HDFS
Pipeline: the web app produces logs → Nginx writes the access log to the configured directory → a Flume agent acts as the Kafka producer and pushes the collected log lines to Kafka → a second Flume agent acts as the consumer and writes the data to HDFS.
flume-kafka.conf (identical to the tail-kafka.conf from step 4):
a1.sources = source1
a1.sinks = k1
a1.channels = c1
a1.sources.source1.type = exec
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = first
a1.sinks.k1.brokerList = hdp-1:9092,hdp-2:9092,hdp-3:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1
kafka-hdfs.conf
# name of the source
agent.sources = kafkaSource
# name of the channel, conventionally named after its type
agent.channels = memoryChannel
# name of the sink, conventionally named after its destination
agent.sinks = hdfsSink
# channel used by the source
agent.sources.kafkaSource.channels = memoryChannel
# channel used by the sink (note: singular "channel" here)
agent.sinks.hdfsSink.channel = memoryChannel

#-------- kafkaSource configuration -----------------
# source type
agent.sources.kafkaSource.type = org.apache.flume.source.kafka.KafkaSource
# Zookeeper address used by Kafka
#
# Special note: this is the Zookeeper connect string of the Kafka cluster
#
agent.sources.kafkaSource.zookeeperConnect = hdp-1:2181,hdp-2:2181,hdp-3:2181
# Kafka topic to consume
agent.sources.kafkaSource.topic = first
# consumer group id
#agent.sources.kafkaSource.groupId = flume
# consumer timeout; any other Kafka consumer option can be set the same way.
# Note the format: properties starting with kafka.* are passed to the consumer.
agent.sources.kafkaSource.kafka.consumer.timeout.ms = 100

#------- memoryChannel configuration -------------------------
# channel type
agent.channels.memoryChannel.type = memory
# number of events the channel can hold
agent.channels.memoryChannel.capacity=10000
# transaction capacity
agent.channels.memoryChannel.transactionCapacity=1000

#--------- hdfsSink configuration ------------------
agent.sinks.hdfsSink.type = hdfs
# write directly into the warehouse directory of the Hive table t_kafka_hive (created below)
agent.sinks.hdfsSink.hdfs.path = hdfs://hdp-1:9000/user/hive/warehouse/hdp_1_hive.db/t_kafka_hive
agent.sinks.hdfsSink.hdfs.writeFormat = Text
agent.sinks.hdfsSink.hdfs.fileType = DataStream
agent.sinks.hdfsSink.hdfs.rollSize = 1024
agent.sinks.hdfsSink.hdfs.rollCount = 0
agent.sinks.hdfsSink.hdfs.rollInterval = 60
# file name prefix and suffix
agent.sinks.hdfsSink.hdfs.filePrefix=test
agent.sinks.hdfsSink.hdfs.fileSuffix=.data
# prefix/suffix for in-progress (temporary) files before they are closed and renamed
agent.sinks.hdfsSink.hdfs.inUsePrefix=_
agent.sinks.hdfsSink.hdfs.inUseSuffix=
# custom interceptor (optional)
#agent.sources.kafkaSource.interceptors=i1
#agent.sources.kafkaSource.interceptors.i1.type=com.hadoop.flume.FormatInterceptor$Builder
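With these settings the sink closes (rolls) a file every 60 seconds or once about 1024 bytes have been written (rollCount = 0 disables rolling by event count), and because hdfs.path points at the warehouse directory of the Hive table created at the end of these notes, each closed file immediately becomes visible to Hive queries.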
Start Flume:
/root/apps/flume-1.7.0/bin/flume-ng agent \
-c conf \
-n agent \
-f /root/apps/flume-1.7.0/confdir/kafka-hdfs.conf \
-Dflume.root.logger=DEBUG,console
The topic is what links the producer and the consumer.
Create the table in the Hive database:
create table t_kafka_hive(ip string, user_name string, user_local string)
row format delimited
fields terminated by ',';
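Once the pipeline has been running for a while, the collected rows can be checked with a simple query such as select * from t_kafka_hive limit 10;, since the Flume HDFS sink writes its files directly into this table's warehouse directory.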