[root@master cleaned]# cd
[root@master ~]# hdfs dfs -ls /user/hive/warehouse/result_*/
Found 1 items
-rw-r--r-- 1 root supergroup 447 2025-12-29 09:27 /user/hive/warehouse/result_financing_survival_corr/000000_0
Found 1 items
-rw-r--r-- 1 root supergroup 23 2025-12-29 11:30 /user/hive/warehouse/result_founder_invest_analysis/000000_0
Found 1 items
-rw-r--r-- 1 root supergroup 939 2025-12-29 11:28 /user/hive/warehouse/result_longest_survivor_per_industry/000000_0
Found 1 items
-rw-r--r-- 1 root supergroup 187 2025-12-29 09:21 /user/hive/warehouse/result_policy_death_region/000000_0
Found 1 items
-rw-r--r-- 1 root supergroup 559 2025-12-29 09:16 /user/hive/warehouse/result_top_funded_company_rank/000000_0
Found 1 items
-rw-r--r-- 1 root supergroup 1388 2025-12-29 11:32 /user/hive/warehouse/result_top_investor_avg_survival/000000_0
[root@master ~]# hdfs dfs -cat /user/hive/warehouse/result_longest_survivor_per_industry/000000_0 | head -3
10652016-07-011
12122012-04-011
12472016-01-011
[root@master ~]# hdfs dfs -cat /user/hive/warehouse/result_top_investor_avg_survival/000000_0 | head -3
北京2238365.002.238E9
广东1095365.001.095E9
上海1034365.001.034E9
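
The columns above only look fused: Hive writes its default field separator \u0001 (Ctrl-A), which does not print in a terminal, and the DataX reader below is configured with the matching fieldDelimiter "\u0001". A quick way to make the separator visible (an optional check, not part of the original run; GNU cat -A renders the control character as ^A):

[root@master ~]# hdfs dfs -cat /user/hive/warehouse/result_top_investor_avg_survival/000000_0 | head -3 | cat -A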
[root@master ~]# mysql -u root -p181750Qy. -e "
> CREATE DATABASE IF NOT EXISTS bigdata_analysis DEFAULT CHARSET=utf8mb4;
>
> USE bigdata_analysis;
>
> -- Table 1: longest-surviving company per industry (3 fields)
> CREATE TABLE IF NOT EXISTS result_longest_survivor_per_industry (
> industry VARCHAR(50),
> company_name VARCHAR(100),
> max_live_days INT
> ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
>
> -- Table 2: founder investment analysis (3 fields)
> CREATE TABLE IF NOT EXISTS result_founder_invest_analysis (
> investor VARCHAR(100),
> founder_company_count INT,
> avg_survival_days DECIMAL(10,2)
> ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
>
> -- Table 3: top investor performance (4 fields)
> CREATE TABLE IF NOT EXISTS result_top_investor_avg_survival (
> top_investor VARCHAR(100),
> invested_company_count INT,
> avg_survival_days DECIMAL(10,2),
> total_investment DOUBLE
> ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
> "
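
Only three of the six Hive result directories get matching MySQL tables here, one per planned DataX job. A quick way to confirm they were created (an optional check, not part of the original run):

[root@master ~]# mysql -u root -p181750Qy. -e "SHOW TABLES FROM bigdata_analysis;"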
[root@master ~]# mkdir -p /usr/local/datax/job
[root@master ~]# vim /usr/local/datax/job/job_longest_survivor.json
[root@master ~]# python /usr/local/datax/bin/datax.py /usr/local/datax/job/job_longest_survivor.json
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
2025-12-29 11:33:50.834 [main] INFO MessageSource - JVM TimeZone: GMT+08:00, Locale: zh_CN
2025-12-29 11:33:50.835 [main] INFO MessageSource - use Locale: zh_CN timeZone: sun.util.calendar.ZoneInfo[id="GMT+08:00",offset=28800000,dstSavings=0,useDaylight=false,transitions=0,lastRule=null]
2025-12-29 11:33:50.849 [main] INFO VMInfo - VMInfo# operatingSystem class => sun.management.OperatingSystemImpl
2025-12-29 11:33:50.854 [main] INFO Engine - the machine info =>
osInfo: Oracle Corporation 1.8 25.121-b13
jvmInfo: Linux amd64 3.10.0-1160.119.1.el7.x86_64
cpu num: 4
totalPhysicalMemory: -0.00G
freePhysicalMemory: -0.00G
maxFileDescriptorCount: -1
currentOpenFileDescriptorCount: -1
GC Names [PS MarkSweep, PS Scavenge]
MEMORY_NAME | allocation_size | init_size
PS Eden Space | 256.00MB | 256.00MB
Code Cache | 240.00MB | 2.44MB
Compressed Class Space | 1,024.00MB | 0.00MB
PS Survivor Space | 42.50MB | 42.50MB
PS Old Gen | 683.00MB | 683.00MB
Metaspace | -0.00MB | 0.00MB
2025-12-29 11:33:50.867 [main] INFO Engine -
{
"content":[
{
"reader":{
"name":"hdfsreader",
"parameter":{
"column":[
{
"index":0,
"type":"string"
},
{
"index":1,
"type":"string"
},
{
"index":2,
"type":"long"
}
],
"defaultFS":"hdfs://master:9000",
"encoding":"UTF-8",
"fieldDelimiter":"\u0001",
"fileType":"text",
"path":"/user/hive/warehouse/result_longest_survivor_per_industry/*"
}
},
"writer":{
"name":"mysqlwriter",
"parameter":{
"column":[
"industry",
"company_name",
"max_live_days"
],
"connection":[
{
"jdbcUrl":"jdbc:mysql://localhost:3306/bigdata_analysis?useSSL=false&serverTimezone=UTC",
"table":[
"result_longest_survivor_per_industry"
]
}
],
"password":"*********",
"username":"root"
}
}
}
],
"setting":{
"speed":{
"channel":1
}
}
}
2025-12-29 11:33:50.882 [main] WARN Engine - prioriy set to 0, because NumberFormatException, the value is: null
2025-12-29 11:33:50.883 [main] INFO PerfTrace - PerfTrace traceId=job_-1, isEnable=false, priority=0
2025-12-29 11:33:50.883 [main] INFO JobContainer - DataX jobContainer starts job.
2025-12-29 11:33:50.884 [main] INFO JobContainer - Set jobId = 0
2025-12-29 11:33:50.898 [job-0] INFO HdfsReader$Job - init() begin...
2025-12-29 11:33:51.365 [job-0] INFO HdfsReader$Job - hadoopConfig details:{"finalParameters":[]}
2025-12-29 11:33:51.365 [job-0] INFO HdfsReader$Job - init() ok and end...
2025-12-29 11:33:51.716 [job-0] INFO OriginalConfPretreatmentUtil - table:[result_longest_survivor_per_industry] all columns:[
industry,company_name,max_live_days
].
2025-12-29 11:33:51.721 [job-0] INFO OriginalConfPretreatmentUtil - Write data [
INSERT INTO %s (industry,company_name,max_live_days) VALUES(?,?,?)
], which jdbcUrl like:[jdbc:mysql://localhost:3306/bigdata_analysis?useSSL=false&serverTimezone=UTC&yearIsDateType=false&zeroDateTimeBehavior=CONVERT_TO_NULL&rewriteBatchedStatements=true&tinyInt1isBit=false]
2025-12-29 11:33:51.722 [job-0] INFO JobContainer - jobContainer starts to do prepare ...
2025-12-29 11:33:51.722 [job-0] INFO JobContainer - DataX Reader.Job [hdfsreader] do prepare work .
2025-12-29 11:33:51.722 [job-0] INFO HdfsReader$Job - prepare(), start to getAllFiles...
2025-12-29 11:33:51.723 [job-0] INFO HdfsReader$Job - get HDFS all files in path = [/user/hive/warehouse/result_longest_survivor_per_industry/*]
Dec 29, 2025 11:33:51 AM org.apache.hadoop.util.NativeCodeLoader <clinit>
WARNING: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2025-12-29 11:33:52.414 [job-0] INFO HdfsReader$Job - [hdfs://master:9000/user/hive/warehouse/result_longest_survivor_per_industry/000000_0] is a [text] file; adding it to the source files list
2025-12-29 11:33:52.416 [job-0] INFO HdfsReader$Job - number of files to read: [1], list: [hdfs://master:9000/user/hive/warehouse/result_longest_survivor_per_industry/000000_0]
2025-12-29 11:33:52.416 [job-0] INFO JobContainer - DataX Writer.Job [mysqlwriter] do prepare work .
2025-12-29 11:33:52.417 [job-0] INFO JobContainer - jobContainer starts to do split ...
2025-12-29 11:33:52.417 [job-0] INFO JobContainer - Job set Channel-Number to 1 channels.
2025-12-29 11:33:52.417 [job-0] INFO HdfsReader$Job - split() begin...
2025-12-29 11:33:52.417 [job-0] INFO JobContainer - DataX Reader.Job [hdfsreader] splits to [1] tasks.
2025-12-29 11:33:52.417 [job-0] INFO JobContainer - DataX Writer.Job [mysqlwriter] splits to [1] tasks.
2025-12-29 11:33:52.425 [job-0] INFO JobContainer - jobContainer starts to do schedule ...
2025-12-29 11:33:52.431 [job-0] INFO JobContainer - Scheduler starts [1] taskGroups.
2025-12-29 11:33:52.434 [job-0] INFO JobContainer - Running by standalone Mode.
2025-12-29 11:33:52.462 [taskGroup-0] INFO TaskGroupContainer - taskGroupId=[0] start [1] channels for [1] tasks.
2025-12-29 11:33:52.466 [taskGroup-0] INFO Channel - Channel set byte_speed_limit to -1, No bps activated.
2025-12-29 11:33:52.466 [taskGroup-0] INFO Channel - Channel set record_speed_limit to -1, No tps activated.
2025-12-29 11:33:52.476 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[0] attemptCount[1] is started
2025-12-29 11:33:52.499 [0-0-0-reader] INFO HdfsReader$Job - hadoopConfig details:{"finalParameters":["mapreduce.job.end-notification.max.retry.interval","mapreduce.job.end-notification.max.attempts"]}
2025-12-29 11:33:52.500 [0-0-0-reader] INFO Reader$Task - read start
2025-12-29 11:33:52.500 [0-0-0-reader] INFO Reader$Task - reading file : [hdfs://master:9000/user/hive/warehouse/result_longest_survivor_per_industry/000000_0]
2025-12-29 11:33:52.520 [0-0-0-reader] INFO UnstructuredStorageReaderUtil - CsvReader using default values [{"captureRawRecord":true,"columnCount":0,"comment":"#","currentRecord":-1,"delimiter":"\u0001","escapeMode":1,"headerCount":0,"rawRecord":"","recordDelimiter":"\u0000","safetySwitch":false,"skipEmptyRecords":true,"textQualifier":"\"","trimWhitespace":true,"useComments":false,"useTextQualifier":true,"values":[]}], csvReaderConfig is [null]
2025-12-29 11:33:52.529 [0-0-0-reader] INFO Reader$Task - end read source files...
2025-12-29 11:33:52.576 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[0] is successed, used[104]ms
2025-12-29 11:33:52.577 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] completed it's tasks.
2025-12-29 11:34:02.479 [job-0] INFO StandAloneJobContainerCommunicator - Total 43 records, 540 bytes | Speed 54B/s, 4 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 0.000s | All Task WaitReaderTime 0.000s | Percentage 100.00%
2025-12-29 11:34:02.479 [job-0] INFO AbstractScheduler - Scheduler accomplished all tasks.
2025-12-29 11:34:02.480 [job-0] INFO JobContainer - DataX Writer.Job [mysqlwriter] do post work.
2025-12-29 11:34:02.480 [job-0] INFO JobContainer - DataX Reader.Job [hdfsreader] do post work.
2025-12-29 11:34:02.480 [job-0] INFO JobContainer - DataX jobId [0] completed successfully.
2025-12-29 11:34:02.481 [job-0] INFO HookInvoker - No hook invoked, because base dir not exists or is a file: /usr/local/datax/hook
2025-12-29 11:34:02.482 [job-0] INFO JobContainer -
[total cpu info] =>
averageCpu | maxDeltaCpu | minDeltaCpu
-1.00% | -1.00% | -1.00%
[total gc info] =>
NAME | totalGCCount | maxDeltaGCCount | minDeltaGCCount | totalGCTime | maxDeltaGCTime | minDeltaGCTime
PS MarkSweep | 1 | 1 | 1 | 0.019s | 0.019s | 0.019s
PS Scavenge | 1 | 1 | 1 | 0.016s | 0.016s | 0.016s
2025-12-29 11:34:02.482 [job-0] INFO JobContainer - PerfTrace not enable!
2025-12-29 11:34:02.482 [job-0] INFO StandAloneJobContainerCommunicator - Total 43 records, 540 bytes | Speed 54B/s, 4 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 0.000s | All Task WaitReaderTime 0.000s | Percentage 100.00%
2025-12-29 11:34:02.485 [job-0] INFO JobContainer -
Job start time            : 2025-12-29 11:33:50
Job end time              : 2025-12-29 11:34:02
Total elapsed time        : 11s
Average throughput        : 54B/s
Record write speed        : 4rec/s
Total records read        : 43
Total read/write failures : 0
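
The job reports 43 records read and 0 failures. A row count on the target table is a quick cross-check (an optional step; a count higher than 43 would indicate rows left over from earlier loads, which the full SELECT further below suggests):

[root@master ~]# mysql -u root -p181750Qy. -e "SELECT COUNT(*) FROM bigdata_analysis.result_longest_survivor_per_industry;"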
[root@master ~]# python /usr/local/datax/bin/datax.py /usr/local/datax/job/job_founder_invest.json
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
2025-12-29 11:34:03.125 [main] ERROR Engine -
According to DataX's analysis, the most likely cause of this job's failure is:
com.alibaba.datax.common.exception.DataXException: Code:[Framework-03], Description:[DataX engine configuration error, usually caused by a faulty DataX installation; please contact your operations team.]. - Failed to load job configuration: /usr/local/datax/job/job_founder_invest.json - java.io.FileNotFoundException: File '/usr/local/datax/job/job_founder_invest.json' does not exist
at org.apache.commons.io.FileUtils.openInputStream(FileUtils.java:299)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1711)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1748)
at com.alibaba.datax.core.util.ConfigParser.getJobContent(ConfigParser.java:106)
at com.alibaba.datax.core.util.ConfigParser.parseJobConfig(ConfigParser.java:74)
at com.alibaba.datax.core.util.ConfigParser.parse(ConfigParser.java:26)
at com.alibaba.datax.core.Engine.entry(Engine.java:138)
at com.alibaba.datax.core.Engine.main(Engine.java:208)
at com.alibaba.datax.common.exception.DataXException.asDataXException(DataXException.java:41)
at com.alibaba.datax.core.util.ConfigParser.getJobContent(ConfigParser.java:108)
at com.alibaba.datax.core.util.ConfigParser.parseJobConfig(ConfigParser.java:74)
at com.alibaba.datax.core.util.ConfigParser.parse(ConfigParser.java:26)
at com.alibaba.datax.core.Engine.entry(Engine.java:138)
at com.alibaba.datax.core.Engine.main(Engine.java:208)
Caused by: java.io.FileNotFoundException: File '/usr/local/datax/job/job_founder_invest.json' does not exist
at org.apache.commons.io.FileUtils.openInputStream(FileUtils.java:299)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1711)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1748)
at com.alibaba.datax.core.util.ConfigParser.getJobContent(ConfigParser.java:106)
... 4 more
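
The error above simply means /usr/local/datax/job/job_founder_invest.json was never written (only job_longest_survivor.json was created with vim). A minimal sketch of what it could contain, reusing the hdfsreader-to-mysqlwriter pattern echoed above and the result_founder_invest_analysis schema created earlier; the column order (investor, founder_company_count, avg_survival_days) is an assumption about how the Hive result file is laid out:

{
    "job": {
        "setting": { "speed": { "channel": 1 } },
        "content": [
            {
                "reader": {
                    "name": "hdfsreader",
                    "parameter": {
                        "defaultFS": "hdfs://master:9000",
                        "path": "/user/hive/warehouse/result_founder_invest_analysis/*",
                        "fileType": "text",
                        "encoding": "UTF-8",
                        "fieldDelimiter": "\u0001",
                        "column": [
                            { "index": 0, "type": "string" },
                            { "index": 1, "type": "long" },
                            { "index": 2, "type": "double" }
                        ]
                    }
                },
                "writer": {
                    "name": "mysqlwriter",
                    "parameter": {
                        "username": "root",
                        "password": "*********",
                        "column": [ "investor", "founder_company_count", "avg_survival_days" ],
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:mysql://localhost:3306/bigdata_analysis?useSSL=false&serverTimezone=UTC",
                                "table": [ "result_founder_invest_analysis" ]
                            }
                        ]
                    }
                }
            }
        ]
    }
}

After saving the file (with the real password in place of the mask), the same datax.py command can be rerun.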
[root@master ~]# python /usr/local/datax/bin/datax.py /usr/local/datax/job/job_top_investor.json
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
2025-12-29 11:34:03.504 [main] ERROR Engine -
According to DataX's analysis, the most likely cause of this job's failure is:
com.alibaba.datax.common.exception.DataXException: Code:[Framework-03], Description:[DataX engine configuration error, usually caused by a faulty DataX installation; please contact your operations team.]. - Failed to load job configuration: /usr/local/datax/job/job_top_investor.json - java.io.FileNotFoundException: File '/usr/local/datax/job/job_top_investor.json' does not exist
at org.apache.commons.io.FileUtils.openInputStream(FileUtils.java:299)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1711)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1748)
at com.alibaba.datax.core.util.ConfigParser.getJobContent(ConfigParser.java:106)
at com.alibaba.datax.core.util.ConfigParser.parseJobConfig(ConfigParser.java:74)
at com.alibaba.datax.core.util.ConfigParser.parse(ConfigParser.java:26)
at com.alibaba.datax.core.Engine.entry(Engine.java:138)
at com.alibaba.datax.core.Engine.main(Engine.java:208)
at com.alibaba.datax.common.exception.DataXException.asDataXException(DataXException.java:41)
at com.alibaba.datax.core.util.ConfigParser.getJobContent(ConfigParser.java:108)
at com.alibaba.datax.core.util.ConfigParser.parseJobConfig(ConfigParser.java:74)
at com.alibaba.datax.core.util.ConfigParser.parse(ConfigParser.java:26)
at com.alibaba.datax.core.Engine.entry(Engine.java:138)
at com.alibaba.datax.core.Engine.main(Engine.java:208)
Caused by: java.io.FileNotFoundException: File '/usr/local/datax/job/job_top_investor.json' does not exist
at org.apache.commons.io.FileUtils.openInputStream(FileUtils.java:299)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1711)
at org.apache.commons.io.FileUtils.readFileToString(FileUtils.java:1748)
at com.alibaba.datax.core.util.ConfigParser.getJobContent(ConfigParser.java:106)
... 4 more
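
job_top_investor.json is missing for the same reason and would follow the same sketch, with only these fragments changed (again an assumption, based on the four-column result_top_investor_avg_survival table defined above):

                        "path": "/user/hive/warehouse/result_top_investor_avg_survival/*",
                        "column": [
                            { "index": 0, "type": "string" },
                            { "index": 1, "type": "long" },
                            { "index": 2, "type": "double" },
                            { "index": 3, "type": "double" }
                        ]
                        ...
                        "column": [ "top_investor", "invested_company_count", "avg_survival_days", "total_investment" ],
                        "table": [ "result_top_investor_avg_survival" ]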
[root@master ~]# mysql -u root -p181750Qy. -e "SELECT * FROM bigdata_analysis.result_longest_survivor_per_industry;"
+--------------+--------------------+---------------+
| industry | company_name | max_live_days |
+--------------+--------------------+---------------+
| 金融 | 信诚贷 | 999 |
| 金融 | 比特人 | 999 |
| 电子商务 | 折子网 | 999 |
| 游戏 | 千奇网络 | 999 |
| 汽车交通 | 淘车乐 | 999 |
| 新工业 | 必因科技 | 999 |
| 广告营销 | 纸指天下 | 999 |
| 企业服务 | Smart2On | 999 |
| 企业服务 | 九千年 | 999 |
| 文娱传媒 | 电音网 | 998 |
| 文娱传媒 | 爱豆影视 | 998 |
| 医疗健康 | 爱健康 | 998 |
| 社交网络 | 法天下 | 998 |
| 本地生活 | 一点通 | 997 |
| 本地生活 | 威力恩 | 997 |
| 旅游 | 去海钓网 | 997 |
| 教育 | 零距校园网 | 997 |
| 本地生活 | 帮个忙 | 997 |
| 硬件 | 中博宏大 | 997 |
| 工具软件 | 51CV.me | 997 |
| 工具软件 | 通知盒amybox | 997 |
| 房产服务 | E居多得 | 996 |
| 物流 | 运多多 | 981 |
| 体育运动 | 速播体育 | 949 |
| 农业 | 华夏康家 | 390 |
| 1065 | 2016-07-01 | 1 |
| 1212 | 2012-04-01 | 1 |
| 1247 | 2016-01-01 | 1 |
| 1278 | 2015-12-01 | 1 |
| 1400 | 2015-08-01 | 4 |
| 1521 | 2014-11-01 | 2 |
| 1522 | 2014-05-01 | 1 |
| 1825 | 2013-01-01 | 1 |
| 1887 | 2011-11-01 | 1 |
| 2039 | 2012-06-01 | 1 |
| 2070 | 2012-05-01 | 2 |
| 2131 | 2010-03-01 | 1 |
| 2222 | 2006-12-01 | 1 |
| 2556 | 2006-01-01 | 1 |
| 418 | 2016-06-01 | 1 |
| 725 | 2013-08-01 | 1 |
| 734 | 2014-08-01 | 1 |
| 740 | 2014-09-01 | 1 |
| 791 | 2017-04-01 | 1 |
| 822 | 2017-03-01 | 1 |
| 854 | 2014-04-01 | 1 |
| 942 | 2013-01-01 | 1 |
| 944 | 2014-01-01 | 1 |
| 950 | 2015-01-01 | 1 |
| 企业服务 | 龙腾天下 | 789 |
| 体育运动 | 骑程网 | 73 |
| 农业 | 许鲜网 | 4 |
| 医疗健康 | 鼎诚智能科技 | 201 |
| 工具软件 | 鸸鹋爱卖萌 | 374 |
| 广告营销 | 魅蓝互动 | 172 |
| 房产服务 | 鼎家网络 | 124 |
| 教育 | 齐贤教育 | 376 |
| 文娱传媒 | 鸡蛋娱乐 | 421 |
| 新工业 | 鼎喜手机 | 72 |
| 旅游 | 麻游旅行 | 255 |
| 本地生活 | 黄帽子劳务 | 595 |
| 汽车交通 | 麦麦车 | 311 |
| 游戏 | 龙辰文化 | 334 |
| 物流 | 马上快递app | 58 |
| 电子商务 | 齐表网 | 925 |
| 硬件 | 黑豆吉他 | 178 |
| 社交网络 | 麦浪 | 500 |
| 金融 | 龙矿科技 | 510 |
+--------------+--------------------+---------------+
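
This table holds 68 rows even though the DataX run above reported only 43 records, which points to rows left over from an earlier load into the same table. The rows whose industry is purely numeric and whose company_name looks like a date also match the stray records visible in the earlier HDFS head output, so the problem originates in the Hive result file rather than in the transfer. A quick way to pull those rows out for review (the numeric-industry pattern is an assumption about what distinguishes them):

[root@master ~]# mysql -u root -p181750Qy. -e "SELECT * FROM bigdata_analysis.result_longest_survivor_per_industry WHERE industry REGEXP '^[0-9]+$';"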
[root@master ~]# mysql -u root -p181750Qy. -e "SELECT * FROM bigdata_analysis.result_founder_invest_analysis;"
[root@master ~]# mysql -u root -p181750Qy. -e "SELECT * FROM bigdata_analysis.result_top_investor_avg_survival;"
[root@master ~]# hdfs dfs -ls /input/cleaned_data/
Found 1 items
-rw-r--r-- 1 root supergroup 2455476 2025-12-29 11:15 /input/cleaned_data/companies.csv
[root@master ~]#