Nginx生成日志文件 Flume采集日志文件 kafka存储日志文件上传到hdfs上的hive中

最新推荐文章于 2022-07-06 01:36:05 发布

Thomas_HJW

最新推荐文章于 2022-07-06 01:36:05 发布

阅读量530

点赞数

分类专栏： hive 文章标签： kafka

本文链接：https://blog.youkuaiyun.com/Tomas_White/article/details/102554483

版权

hive 专栏收录该内容

3 篇文章

订阅专栏

本文详细介绍了一种从Web工程中收集日志文件，并利用Nginx、Flume、Kafka和HDFS进行数据处理与存储的技术流程。具体步骤包括启动HDFS、Zookeeper、Nginx等服务，配置Flume将日志数据发送至Kafka，再由Flume将Kafka数据写入HDFS。此外，还介绍了如何通过Java代码将日志数据上传至HDFS，以及使用Flume配置将日志文件直接上传至HDFS的方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1.启动HDFS集群 start-all.sh

2.启动Zookeeper 脚本启动 sh zkmanager.sh start

3.启动Nginx cd /usr/local/nginx/sbin/ 执行 ./nginx

/usr/local/nginx/conf

采集的日志文件格式为

log_format  main  '$remote_addr,$remote_user,$time_local';

补充：修改Nginx配置文件 cd /usr/local/nginx/conf/

vi nginx.conf

 upstream frame-tomcat {
          server hdp-4:9090 ; 
    }
    server {
        listen       80;
        server_name  hdp-1;

        #charset koi8-r;

        access_log  logs/log.frame.access.log  main;

        location / {
            # root   html;
            # index  index.html index.htm;
            proxy_pass http://frame-tomcat;
        }

        error_page   500 502 503 504  /50x.html;
        location = /50x.html {
            root   html;
        }


    }

4.启动Flume（由于已经配置了环境变量，启动命令中可以直接使用 $FLUME_HOME，启动命令见下文）

补充：修改Flume配置文件

cd apps/flume-1.6.0/conf/ vi tail-kafka.conf

# Flume agent "a1": follows the Nginx access log and forwards it to Kafka.
# Declares one source, one sink and one channel for this agent.
a1.sources = source1
a1.sinks = k1
a1.channels = c1

# Source: an exec source that runs `tail -F` on the Nginx access log,
# so lines appended to the log are picked up as they are written.
a1.sources.source1.type = exec
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
# Sink: Flume's Kafka sink, publishing to topic "first" on the three listed brokers.
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = first
a1.sinks.k1.brokerList = hdp-1:9092,hdp-2:9092,hdp-3:9092
# Acknowledgements required per message, and messages handled per batch.
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20

# Channel: buffers events in memory between the source and the sink
# (capacity = events held in the channel, transactionCapacity = events per transaction).
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
# (the source key is the plural "channels", the sink key is the singular "channel").
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1

$FLUME_HOME/bin/flume-ng agent \
-c conf \
-n a1 \
-f $FLUME_HOME/conf/tail-kafka.conf \
-Dflume.root.logger=DEBUG,console

5.启动kafka(每台有zookeeper的机器都要启动)

cd apps/kafka_2.12-2.2.0/bin/

启动kafka

./kafka-server-start.sh ../config/server.properties &

创建一个主题(topic)

./kafka-topics.sh --zookeeper hdp-1:2181 --create --replication-factor 3 --partitions 1 --topic first

创建一个生产者

./kafka-console-producer.sh --broker-list hdp-1:9092 --topic first

创建一个消费者

./kafka-console-consumer.sh --bootstrap-server hdp-1:9092 --from-beginning --topic first

6.在IDEA中写一个消费者类 ConsumerDemo

package com.stu.consumer;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.util.Collections;
import java.util.Properties;

/**
 * Minimal Kafka consumer: subscribes to the {@code first} topic and prints every record it
 * receives to standard output, one record per line.
 */
public class ConsumerDemo {
    /** Consumer configuration; filled once when the class is loaded. */
    private static final Properties props = new Properties();

    static {
        // Broker used for the initial connection to the Kafka cluster.
        props.put("bootstrap.servers", "hdp-1:9092");
        // Keys and values are deserialized as plain strings.
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // Consumer group this consumer joins.
        props.put("group.id", "yangk");
    }

    /**
     * Polls the {@code first} topic in an endless loop and prints topic, offset, key and value
     * of every record. The method only returns by throwing; the consumer is closed in that case.
     */
    private static void ConsumerMessage() {
        // Allow offsets to be committed automatically.
        props.put("enable.auto.commit", true);

        // try-with-resources also covers a failure in subscribe(), which the original
        // try/finally (opened only after subscribe) did not.
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<String, String>(props)) {
            consumer.subscribe(Collections.singleton("first"));

            while (true) {
                // NOTE(review): poll(long) is deprecated in newer kafka-clients releases in
                // favour of poll(Duration); kept as-is because the client version used by
                // this project is not visible here - confirm and migrate if possible.
                ConsumerRecords<String, String> records = consumer.poll(100);
                for (ConsumerRecord<String, String> r : records) {
                    // %n terminates each record; without it all records run together on one line.
                    System.out.printf("topic = %s, offset = %s, key = %s, value = %s%n",
                            r.topic(), r.offset(), r.key(), r.value());
                }
            }
        }
    }

    public static void main(String[] args) {
        ConsumerMessage();
    }
}

7.通过java代码将日志文件上传到hdfs上

先将kafka得到的数据写到一个本地文件中，再将本地文件上传到hdfs上(过程繁琐，只为自己理解用)

创建maven项目实现

package com.stu.kafkatohtml;

import com.stu.producers.ProducerDemo2;

import java.io.*;


/**
 * Prototype for the "buffer locally, then upload to HDFS" step: creates a local directory and
 * file and appends one line of text to it.
 *
 * <p>Planned workflow described by the author (only the local write is implemented here):
 * <ol>
 *   <li>create a local directory;</li>
 *   <li>write the data collected from Kafka into a file in that directory;</li>
 *   <li>after a certain time (timer) or once the file reaches a certain size, upload the file
 *       to HDFS and start a new local file.</li>
 * </ol>
 */
public class ConsumerDemo {
    /**
     * Runs the local-file demo.
     *
     * @param args unused
     * @throws Exception if the local directory or file cannot be created or written
     */
    public static void main(String[] args) throws Exception {
        MakdirWindows();
    }

    /**
     * Ensures the directory {@code E:/test} and the file {@code kafka-1.txt} inside it exist,
     * prints the file's size as measured before this call wrote anything, and appends the line
     * {@code Hello} to the file.
     *
     * @throws IOException if the directory or file cannot be created, or the write fails
     */
    private static void MakdirWindows() throws IOException {
        // 1. Target directory.
        String directory = "E:/test";
        File dir = new File(directory);
        // Create the directory when missing; the original ignored a failed mkdirs() and only
        // failed later with a less helpful error when opening the file.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("Could not create directory " + dir.getAbsolutePath());
        }

        // 2. Target file; i is the running file index (fixed to 1 in this prototype).
        int i = 1;
        String format = "/kafka-" + i + ".txt";
        File file2 = new File(directory, format);
        // Size before anything is written by this call (0 when the file does not exist yet).
        long length = file2.length();

        // Create an empty file when it does not exist yet. The original wrapped this in
        // "length < 10", which is always true for a missing file and therefore redundant.
        if (!file2.exists()) {
            file2.createNewFile();
        }
        System.out.println(file2.getName() + "文件大小为：" + length);

        // 3. Append to the file. try-with-resources closes the stream even when write() throws;
        // the charset is explicit so the output does not depend on the platform default.
        String srt = "Hello";
        try (BufferedWriter bfw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(file2, true), java.nio.charset.StandardCharsets.UTF_8))) {
            bfw.write(srt + "\r\n");
        }
        // Reported only after the writer has been flushed and closed successfully.
        System.out.println("文件输入成功");
    }
}

定时器

package com.stu.time;


import java.util.Timer;

public class TimerTest {
    public static void main(String[] args) {
        Timer timer = new Timer();
        Timer1 timer1 = new Timer1();
        timer.schedule(timer1,1000,30 * 60 * 1000);
    }
}

package com.stu.time;


import com.stu.kafkatohtml.HtmlDemo;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimerTask;

public class Timer1 extends TimerTask {
    @Override
    public void run() {
        SimpleDateFormat sdm=new SimpleDateFormat("yyyy年MM月dd日 HH点:mm分:ss:秒");
        String format = sdm.format(new Date());

        new HtmlDemo().dirtohdfs();


    }
}

package com.stu.kafkatohtml;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.net.URI;


/**
 * Uploads a local file to HDFS.
 */
public class HtmlDemo {

    /**
     * Copies the local file {@code E:/test/test.txt} to the HDFS path {@code /test} on
     * {@code hdfs://hdp-1:9000}, connecting as user {@code root}.
     *
     * <p>This is best-effort: any failure is printed to standard error and not propagated to
     * the caller.
     */
    public static void dirtohdfs()  {
        try {
            URI uri = new URI("hdfs://hdp-1:9000");
            Configuration conf = new Configuration();
            conf.set("dfs.replication", "2");
            conf.set("dfs.blocksize", "64m");
            String user = "root";

            // try-with-resources closes the file system only when it was actually opened.
            // The original called fs.close() in a finally block and threw a
            // NullPointerException whenever FileSystem.get(...) itself had failed.
            try (FileSystem fs = FileSystem.get(uri, conf, user)) {
                // NOTE(review): the local writer shown in this project writes
                // E:/test/kafka-1.txt, while this uploads E:/test/test.txt - confirm which
                // file is intended.
                Path src = new Path("E:/test/test.txt");
                Path dst = new Path("/test");
                fs.copyFromLocalFile(src, dst);
                System.out.println("HDFS文件上传完成!");
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the calling thread can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

使用flume配置将web工程产生的日志文件上传到hdfs上

web产生日志文件--Nginx采集日志文件存放到指定目录--Flume将采集的日志文件作为生产者发送给kafka--Flume作为消费者将数据写到hdfs上。

flume-kafka.conf

# Producer side of the pipeline - Flume agent "a1": follows the Nginx access log and
# publishes it to Kafka. Declares one source, one sink and one channel.
a1.sources = source1
a1.sinks = k1
a1.channels = c1

# Source: an exec source that runs `tail -F` on the Nginx access log,
# so lines appended to the log are picked up as they are written.
a1.sources.source1.type = exec
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
# Sink: Flume's Kafka sink, publishing to topic "first" on the three listed brokers.
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = first
a1.sinks.k1.brokerList = hdp-1:9092,hdp-2:9092,hdp-3:9092
# Acknowledgements required per message, and messages handled per batch.
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20

# Channel: buffers events in memory between the source and the sink
# (capacity = events held in the channel, transactionCapacity = events per transaction).
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
# (the source key is the plural "channels", the sink key is the singular "channel").
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1

kafka-hdfs.conf

# Consumer side of the pipeline - Flume agent "agent": reads topic "first" from Kafka and
# writes the events to HDFS. Start it with "-n agent" so the name matches this prefix.

# Source name
agent.sources = kafkaSource
# Channel name (suggestion: name it after its type)
agent.channels = memoryChannel
# Sink name (suggestion: name it after its destination)
agent.sinks = hdfsSink

# Channel the source writes to (plural key: "channels")
agent.sources.kafkaSource.channels = memoryChannel
# Channel the sink reads from (note the singular key: "channel")
agent.sinks.hdfsSink.channel = memoryChannel

#-------- kafkaSource settings -----------------
# Source type
agent.sources.kafkaSource.type = org.apache.flume.source.kafka.KafkaSource
# ZooKeeper address of the Kafka cluster.
#
# Important: this is the address of Kafka's ZooKeeper, not the broker list.
#
agent.sources.kafkaSource.zookeeperConnect = hdp-1:2181,hdp-2:2181,hdp-3:2181
# Kafka topic to consume
agent.sources.kafkaSource.topic = first
# Consumer group id (disabled)
#agent.sources.kafkaSource.groupId = flume
# Consumer timeout. Any other Kafka consumer option can be set the same way:
# everything after the "kafka." prefix is treated as a consumer property.
agent.sources.kafkaSource.kafka.consumer.timeout.ms = 100



#------- memoryChannel settings -------------------------
# Channel type
agent.channels.memoryChannel.type = memory
# Number of events the channel can hold
agent.channels.memoryChannel.capacity=10000
# Number of events per transaction
agent.channels.memoryChannel.transactionCapacity=1000

#--------- hdfsSink settings ------------------
agent.sinks.hdfsSink.type = hdfs
# Output directory; presumably the storage directory of the Hive table t_kafka_hive,
# so that the written files become rows of that table - confirm the warehouse path.
agent.sinks.hdfsSink.hdfs.path = hdfs://hdp-1:9000/user/hive/warehouse/hdp_1_hive.db/t_kafka_hive
agent.sinks.hdfsSink.hdfs.writeFormat = Text
agent.sinks.hdfsSink.hdfs.fileType = DataStream

# File rolling: by size (1024 bytes) or by time (60 seconds);
# rollCount = 0 turns off rolling by number of events.
agent.sinks.hdfsSink.hdfs.rollSize = 1024
agent.sinks.hdfsSink.hdfs.rollCount = 0
agent.sinks.hdfsSink.hdfs.rollInterval = 60

# File name prefix and suffix
agent.sinks.hdfsSink.hdfs.filePrefix=test
agent.sinks.hdfsSink.hdfs.fileSuffix=.data

# Naming of files that are still being written: prefix them with "_" and drop the
# temporary suffix. The keys are hdfs.inUsePrefix / hdfs.inUseSuffix; the previous
# spelling "inUser..." is not a sink property, so the setting had no effect.
agent.sinks.hdfsSink.hdfs.inUsePrefix=_
agent.sinks.hdfsSink.hdfs.inUseSuffix=

# Custom interceptor (disabled)
#agent.sources.kafkaSource.interceptors=i1
#agent.sources.kafkaSource.interceptors.i1.type=com.hadoop.flume.FormatInterceptor$Builder

启动flume

# -n must equal the property prefix used in the configuration file ("agent"), otherwise
# Flume finds no components to start; -f points at the kafka-hdfs.conf file shown above.
/root/apps/flume-1.7.0/bin/flume-ng agent \
-c conf \
-n agent \
-f /root/apps/flume-1.7.0/confdir/kafka-hdfs.conf \
-Dflume.root.logger=DEBUG,console

topic 是生产者与消费者连接的关键

hive数据库创建表

create table t_kafka_hive(ip string, user_name string, user_local string)
row format delimited
fields terminated by ',';

Nginx生成日志文件 Flume采集日志文件 kafka存储日志文件 上传到hdfs上的hive中

Nginx生成日志文件 Flume采集日志文件 kafka存储日志文件上传到hdfs上的hive中