1 The table-creation problem
The statement:
create external table if not exists temp (
    ip STRING, identd STRING, username STRING, handledate STRING, requrl STRING,
    status int, returnbytes int, referer string, browser STRING)
row format SERDE 'org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe'
WITH SERDEPROPERTIES (
    'serialization.format'='org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol',
    'quote.delim'='("|\\[|\\])',
    'field.delim'=' ')
STORED AS TEXTFILE location '/logs/apache/2013-02-25';
parses a log line into:
{"ip":"223.198.228.244","identd":"-","username":"-","handledate":"[24/Feb/2013:22:00:54","requrl":"+0800]","status":null,"returnbytes":200,"referer":"52","browser":"http://×××/index.html"}
But the same statement with 'serialization.null.format'='-' added:
create external table if not exists temp (
    ip STRING, identd STRING, username STRING, handledate STRING, requrl STRING,
    status int, returnbytes int, referer string, browser STRING)
row format SERDE 'org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe'
WITH SERDEPROPERTIES (
    'serialization.format'='org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol',
    'quote.delim'='("|\\[|\\])',
    'field.delim'=' ',
    'serialization.null.format'='-')
STORED AS TEXTFILE location '/logs/apache/2013-02-25';
parses correctly. All I added was 'serialization.null.format'='-', which tells the SerDe to read '-' (Apache's placeholder for a missing field) as NULL; why that also fixes the handledate split I don't know.
{"ip":"223.198.228.244","identd":null,"username":null,"handledate":"[24/Feb/2013:22:00:54 +0800]","status":200,"returnbytes":52,"referer":"http://×××/index.html","browser":"mozilla==="}
2 Hive query error:
hive> select "ttt",substring(dateconvert(handledate),0,14),urlconvert(requrl),count(1) from temp group by requrl,substring(handledate,0,14);
FAILED: Error in semantic analysis: Line 1:23 Expression not in GROUP BY key 'handledate'
Hive requires every non-aggregated select expression to appear verbatim in the GROUP BY clause; here the select list has substring(dateconvert(handledate),0,14) while the GROUP BY has substring(handledate,0,14). Changed to:
select "ttt",substring(dateconvert(handledate),0,14),urlconvert(requrl),count(1) from temp group by substring(dateconvert(handledate),0,14),requrl;
3 Using UDFs: I thought the jar had to go into the lib directory on every hadoop node. After every jar update it has to be re-added in Hive with ADD JAR, and in every new session the function has to be re-created. == It later turned out the jar does not need to be on every hadoop node. A per-session sketch follows.
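A minimal per-session sketch (the jar path and UDF class names below are assumptions, not the project's actual ones):

ADD JAR /home/hadoop/lib/myudfs.jar;    -- hypothetical path; re-run after every jar update
CREATE TEMPORARY FUNCTION dateconvert AS 'com.example.udf.DateConvert';   -- hypothetical class
CREATE TEMPORARY FUNCTION urlconvert AS 'com.example.udf.UrlConvert';     -- hypothetical class
-- TEMPORARY functions live only in the current session, hence the re-create every time.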
4 insert into table result_url (select "ttt",substring(dateconvert(handledate),0,14),urlconvert(requrl),count(1) from temp group by substring(dateconvert(handledate),0,14), requrl);
fails with: Parse Error: line 1:29 cannot recognize input near '(' 'select' '"ttt"' in select clause
Changed to (no parentheses around the select):
insert into table result_url select "ttt",substring(dateconvert(handledate),0,14),urlconvert(requrl),count(1) from temp group by substring(dateconvert(handledate),0,14), requrl;
Or alternatively:
create table result_url2 as select "ttt",substring(dateconvert(handledate),0,14),urlconvert(requrl),count(1) from temp group by substring(dateconvert(handledate),0,14), requrl;
which creates the table result_url2 automatically, with columns named _c0,_c1,_c2,_c3. Column aliases avoid that; see the sketch below.
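Adding aliases in the CTAS gives the columns real names (the alias names here are my own choices):

create table result_url2 as
select "ttt" as tag,
       substring(dateconvert(handledate),0,14) as handletime,
       urlconvert(requrl) as url,
       count(1) as pv
from temp
group by substring(dateconvert(handledate),0,14), requrl;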
6 Changing a database user's password: log in to mysql as root, then:
use mysql;
update user set Password=PASSWORD('111111') where User='hadoop';
flush privileges;   -- required, or the change does not take effect
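To check that the new password works from another machine (the host name is a placeholder):

mysql -h mysql-host -u hadoop -p111111 -e 'select 1;'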
7 When a sqoop export throws java.util.NoSuchElementException, the exported file's fields do not line up with the database table's columns; sqoop has to be told the correct field delimiter.
When a hive table is created without specifying a delimiter, the default is Ctrl-A (Unicode code point U+0001).
sqoop export --connect jdbc:mysql://×××/appresult2 --username hadoop --password 111111 \
  -m 1 --table result_pv2 --export-dir /user/hive/warehouse/result_pv/000000_0 \
  --input-fields-terminated-by '\0001'
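Alternatively, declare the delimiter explicitly when creating the hive table, so there is no guessing at export time (the column list here is illustrative, not the real result_pv schema):

create table result_pv (tag string, handletime string, pv int)
row format delimited fields terminated by '\t'
stored as textfile;

and then export with --input-fields-terminated-by '\t'.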
8 Sqoop export to mysql fails with:
java.lang.IllegalArgumentException
    at java.sql.Timestamp.valueOf(Timestamp.java:257)
    at result_url.__loadFromFields(result_url.java:201)
A date-valued field in hive held only year-month-day hour plus a trailing colon: the substring(...,0,14) above truncates the value to 'yyyy-MM-dd HH:', which java.sql.Timestamp.valueOf cannot parse, so loading it into a mysql datetime column fails. A workaround sketch follows.
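One way to work around it: pad the truncated value to a full timestamp before exporting (the staging table and column names are assumptions):

insert overwrite table result_url_export
select tag, concat(handletime, '00:00') as handletime, url, pv
from result_url;
-- '2013-02-24 22:' becomes '2013-02-24 22:00:00', which Timestamp.valueOf accepts.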
9 // Hive commands must be run as the hadoop user, and get must be run in a directory the hadoop user has write permission on.
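A typical sequence (the local path is an assumption):

su - hadoop
cd /home/hadoop             # any directory the hadoop user can write to
hadoop fs -get /user/hive/warehouse/result_pv/000000_0 .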