LogEntity
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class LogEntity implements Writable {
    // Field delimiter of the raw log line, regex-escaped for String.split().
    public static final String delimiter = "\\|@@\\|";
    private String remote_addr;     // client IP address
    private String time_local;      // access time and time zone
    private String request;         // request URL and HTTP protocol
    private String exetime;         // request execution time
    private String status;          // response status code; 200 means success
    private String body_bytes_sent; // size of the response body sent to the client
    private String http_referer;    // page the request was linked from
    private String http_user_agent; // client browser information
    private String req_body;        // request body
    private boolean valid = true;   // whether this record is well-formed

    public String getExetime() {
        return exetime;
    }

    public void setExetime(String exetime) {
        this.exetime = exetime;
    }

    public String getReq_body() {
        return req_body;
    }

    public void setReq_body(String req_body) {
        this.req_body = req_body;
    }

    public String getRemote_addr() {
        return remote_addr;
    }

    public void setRemote_addr(String remote_addr) {
        this.remote_addr = remote_addr;
    }

    public String getTime_local() {
        return time_local;
    }

    public void setTime_local(String time_local) {
        this.time_local = time_local;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_bytes_sent() {
        return body_bytes_sent;
    }

    public void setBody_bytes_sent(String body_bytes_sent) {
        this.body_bytes_sent = body_bytes_sent;
    }

    public String getHttp_referer() {
        return http_referer;
    }

    public void setHttp_referer(String http_referer) {
        this.http_referer = http_referer;
    }

    public String getHttp_user_agent() {
        return http_user_agent;
    }

    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    @Override
    public String toString() {
        // \001 is the field separator the Hive tables below are declared with
        // ("fields terminated by '\001'").
        StringBuilder sb = new StringBuilder();
        sb.append(this.remote_addr);
        sb.append("\001").append(this.time_local);
        sb.append("\001").append(this.exetime);
        sb.append("\001").append(this.request);
        sb.append("\001").append(this.status);
        sb.append("\001").append(this.body_bytes_sent);
        sb.append("\001").append(this.http_referer);
        sb.append("\001").append(this.http_user_agent);
        sb.append("\001").append(this.req_body);
        return sb.toString();
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Fields must be read in exactly the order write() emits them.
        remote_addr = in.readUTF();
        time_local = in.readUTF();
        exetime = in.readUTF();
        request = in.readUTF();
        status = in.readUTF();
        body_bytes_sent = in.readUTF();
        http_referer = in.readUTF();
        http_user_agent = in.readUTF();
        req_body = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(remote_addr);
        out.writeUTF(time_local);
        out.writeUTF(exetime);
        out.writeUTF(request);
        out.writeUTF(status);
        out.writeUTF(body_bytes_sent);
        out.writeUTF(http_referer);
        out.writeUTF(http_user_agent);
        out.writeUTF(req_body);
    }
}
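A quick way to check that write() and readFields() stay in sync is a local round-trip through a byte array. This sketch is not from the original project and the field values are made up; it only relies on the class above:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class LogEntityRoundTrip {
    public static void main(String[] args) throws IOException {
        LogEntity in = new LogEntity();
        in.setRemote_addr("10.0.0.1");          // sample values, invented for the test
        in.setTime_local("2017-08-15 12:34:56");
        in.setExetime("0.012");
        in.setRequest("GET /index HTTP/1.1");
        in.setStatus("200");
        in.setBody_bytes_sent("512");
        in.setHttp_referer("-");
        in.setHttp_user_agent("curl/7.54.0");
        in.setReq_body("-");

        // Serialize to a byte array, then read it back into a fresh entity.
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));
        LogEntity out = new LogEntity();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        // The two \001-delimited strings should be identical.
        System.out.println(in.toString().equals(out.toString()));
    }
}

Note that writeUTF() throws a NullPointerException on null fields, so every field must be set before the entity is written.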
LogParser
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

public class LogParser {
    // nginx time format in the raw log, e.g. "15/Aug/2017:12:34:56".
    static SimpleDateFormat sd1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
    // Normalized format written to the cleaned output, e.g. "2017-08-15 12:34:56".
    static SimpleDateFormat sd2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    public static LogEntity parser(String line) {
        LogEntity log = new LogEntity();
        String[] arr = line.split(LogEntity.delimiter);
        if (arr.length >= 9) {
            log.setRemote_addr(arr[0]);
            log.setTime_local(parseTime(arr[1]));
            log.setRequest(arr[2]);
            log.setExetime(arr[3]);
            log.setStatus(arr[4]);
            log.setBody_bytes_sent(arr[5]);
            log.setHttp_referer(arr[6]);
            log.setHttp_user_agent(arr[7]);
            log.setReq_body(arr[8]);
            if (Integer.parseInt(log.getStatus()) >= 400) {
                // status codes of 400 and above are HTTP errors
                log.setValid(false);
            } else if (log.getRequest().contains("/api/open/files")
                    || log.getRequest().equals("GET / HTTP/1.1")
                    || log.getRequest().equals("HEAD / HTTP/1.0")) {
                // requests excluded from the analysis (a file API path and bare root requests)
                log.setValid(false);
            }
        } else {
            // fewer than nine fields: the line is malformed
            log.setValid(false);
        }
        return log;
    }

    public static String parseTime(String dt) {
        String timeString = "";
        try {
            Date parse = sd1.parse(dt);
            timeString = sd2.format(parse);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return timeString;
    }
}
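To exercise the parser you can hand-build a single line with the literal delimiter "|@@|" (LogEntity.delimiter is its regex-escaped form). The field order matches the indices the parser reads; the values below are invented for illustration:

public class LogParserDemo {
    public static void main(String[] args) {
        String line = String.join("|@@|",
                "10.0.0.1",                    // remote_addr
                "15/Aug/2017:12:34:56",        // time_local, nginx format
                "GET /api/v1/items HTTP/1.1",  // request
                "0.012",                       // exetime
                "200",                         // status
                "532",                         // body_bytes_sent
                "-",                           // http_referer
                "Mozilla/5.0",                 // http_user_agent
                "-");                          // req_body
        LogEntity log = LogParser.parser(line);
        // Prints "true" plus the normalized, \001-delimited record.
        System.out.println(log.isValid() + "\t" + log);
    }
}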
Task
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Clear2 {

    static class ConverterMap extends Mapper<LongWritable, Text, NullWritable, LogEntity> {
        public NullWritable v = NullWritable.get();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            LogEntity log = LogParser.parser(line);
            // Drop malformed records and filtered requests; emit only valid ones.
            if (log.isValid()) {
                context.write(v, log);
            }
        }
    }

    // Route each record to a reducer by its day of month, so each output file
    // holds the records of (roughly) one day.
    static class DatePartitioner extends Partitioner<NullWritable, LogEntity> {
        static Set<Integer> set = new HashSet<>();

        @Override
        public int getPartition(NullWritable key, LogEntity log, int numPartitions) {
            // time_local is "yyyy-MM-dd HH:mm:ss"; characters 8-9 are the day of month.
            int id = Integer.parseInt(log.getTime_local().substring(8, 10));
            if (!set.contains(id)) {
                // debug output: print each day the first time it is seen
                System.out.println(log.getTime_local() + "\t" + id);
                set.add(id);
            }
            return id % 17;
        }
    }

    static class WriteReducer extends Reducer<NullWritable, LogEntity, NullWritable, LogEntity> {
        public NullWritable v = NullWritable.get();

        @Override
        protected void reduce(NullWritable k, Iterable<LogEntity> logs, Context context)
                throws IOException, InterruptedException {
            for (LogEntity log : logs) {
                context.write(k, log);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(Clear2.class);
        // key/value types of the mapper output
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(LogEntity.class);
        // key/value types of the final output
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(LogEntity.class);
        job.setMapperClass(ConverterMap.class);
        job.setPartitionerClass(DatePartitioner.class);
        job.setReducerClass(WriteReducer.class);
        File input = new File("D:\\logs\\08");
        File output = new File("D:\\logs\\08_out");
        FileInputFormat.setInputPaths(job, new Path(input.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(output.getAbsolutePath()));
        job.setNumReduceTasks(17);
        job.waitForCompletion(true);
    }
}
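One consequence of returning id % 17 with 17 reduce tasks is that days d and d+17 of the month share a partition (the 1st and the 18th both land in part-r-00001), so some output files contain two days. A tiny standalone loop, added here for illustration, makes the mapping visible:

public class PartitionMapDemo {
    public static void main(String[] args) {
        for (int day = 1; day <= 31; day++) {
            // Same formula as DatePartitioner.getPartition().
            System.out.printf("day %02d -> part-r-%05d%n", day, day % 17);
        }
    }
}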
Hive commands
DROP TABLE IF EXISTS log;
create table if not exists log(
ip string,
time string,
exetime float,
request string,
status int,
body_bytes int,
referer string,
user_agent string,
terminal int,
channel string,
mobile_type string,
body string
)
row format delimited fields terminated by '\001'
stored as textfile;
load data local inpath '/home/hadoop/idongri_nginx_logs/08/08_out'
overwrite into table log;
DROP TABLE IF EXISTS log_time;
create table if not exists log_time(
ip string,
time string,
dt string,
hms string,
m int,
h int,
d int,
exetime float,
request string,
status int,
body_bytes int,
referer string,
user_agent string,
terminal int,
channel string,
mobile_type string,
body string
)
partitioned by(month string)
row format delimited fields terminated by '\001'
stored as textfile;
-- Hive's substring() is 1-based: in 'yyyy-MM-dd HH:mm:ss', positions 1-10 are
-- the date, 6-7 the month, 12-13 the hour, 9-10 the day. The select list below
-- follows the column order of log_time (ip, time, dt, hms, m, h, d, ...).
insert into log_time partition(month='2017-08')
select c.ip, c.time,
substring(c.time,1,10) as dt,
substring(c.time,12) as hms,
substring(c.time,6,2) as m,
substring(c.time,12,2) as h,
substring(c.time,9,2) as d,
c.exetime,
c.request, c.status, c.body_bytes,
c.referer, c.user_agent,
c.terminal, c.channel, c.mobile_type, c.body
from log c;
drop table if exists pv;
create table if not exists pv(
dt string,
h int,
terminal int,
channel string,
cnt int
)
partitioned by(month string)
row format delimited fields terminated by '\001'
stored as textfile;
insert into pv partition(month='2017-08')
select dt, h, terminal, channel, count(*) as cnt
from log_time
group by dt, h, terminal, channel;
MySQL
drop table if exists log;
create table if not exists log(
ip varchar(20),
time varchar(20),
exetime decimal(9,2),
request text,
status int,
body_bytes int,
referer varchar(200),
user_agent varchar(300),
terminal int,
channel varchar(20),
mobile_type varchar(100),
body text
);
Exporting from Hive to MySQL
/home/hadoop/apps/sqoop-1.4.6/bin/sqoop export \
--connect "jdbc:mysql://192.168.199.100:3306/sakila" \
--username root \
--password root \
--table log \
--fields-terminated-by '\001' \
--export-dir /user/hive/warehouse/test.db/log;
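After the export finishes, a minimal JDBC sanity check can confirm the MySQL row count matches select count(*) from log in Hive. This snippet is not part of the original post; it assumes mysql-connector-java is on the classpath and reuses the connection details from the sqoop command above:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class ExportCheck {
    public static void main(String[] args) throws SQLException {
        // Same URL and credentials as the sqoop export command.
        String url = "jdbc:mysql://192.168.199.100:3306/sakila";
        try (Connection conn = DriverManager.getConnection(url, "root", "root");
             Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery("SELECT COUNT(*) FROM log")) {
            if (rs.next()) {
                System.out.println("rows in mysql log table: " + rs.getLong(1));
            }
        }
    }
}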
