使用 Spark SQL 的 shell 转换 Oracle SQL 的例子

//1.创建表
-- 1. Paid-case (claims) staging table, loaded from a comma-delimited CSV
--    via LOAD DATA (step 3 below).
--    Fix: the original delimiter was wrapped in curly Unicode quotes
--    (‘,’), which Hive/Spark SQL rejects; they must be ASCII single quotes.
--    IF NOT EXISTS makes re-running the script idempotent.
--    NOTE(review): DOUBLE for monetary amounts loses precision;
--    DECIMAL(p, s) would be safer — kept as DOUBLE to preserve the
--    original schema and downstream sums.
CREATE TABLE IF NOT EXISTS auto_paid_case_tmp_01 (
    branch_company_code       VARCHAR(25),
    policy_no                 VARCHAR(60),
    product_code_detail       VARCHAR(24),
    accident_month            VARCHAR(20),  -- joined against inquire_month in step 4
    paid_amount               DOUBLE,
    paid_insured              DOUBLE,
    paid_alae                 DOUBLE,
    paid_amount_retain        DOUBLE,
    paid_insured_retain       DOUBLE,
    paid_alae_retain          DOUBLE,
    paid_num_noncancel        DOUBLE,
    paid_num_zero             DOUBLE,
    paid_num_cancel           DOUBLE,
    paid_num_noncancel_detail DOUBLE,
    paid_num_zero_detail      DOUBLE,
    paid_num_cancel_detail    DOUBLE,
    estimate_amount           DOUBLE,
    estimate_amount_retain    DOUBLE,
    os_num                    DOUBLE,
    os_num_detail             DOUBLE
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

//2创建表
-- 2. Premium base staging table (left side of the join in step 4).
--    Fix: same curly-quote delimiter defect as table 1 — replaced with
--    ASCII single quotes so Hive/Spark SQL can parse the statement.
--    IF NOT EXISTS makes re-running the script idempotent.
--    NOTE(review): DOUBLE kept for the amount columns to preserve the
--    original schema; DECIMAL would avoid float artifacts in the sums.
CREATE TABLE IF NOT EXISTS auto_dl1_base_premium_tmp01 (
    branch_company_code          VARCHAR(21),
    policy_no                    VARCHAR(60),
    department_group_code        VARCHAR(18),
    department_code              VARCHAR(24),
    product_type                 VARCHAR(24),
    estimate_product_type        VARCHAR(500),
    business_require_system2     VARCHAR(500),
    product_code                 VARCHAR(24),
    product_code_detail          VARCHAR(24),
    inception_month              VARCHAR(6),   -- yyyymm
    inquire_month                VARCHAR(6),   -- yyyymm; joined to accident_month
    selling_channel_type         VARCHAR(6),
    policy_type                  VARCHAR(3),
    policy_status                VARCHAR(15),
    party_type                   CHAR(1),
    premium_amount               DOUBLE,
    premium_amount_static        DOUBLE,
    premium_amount_retain        DOUBLE,
    premium_amount_retain_static DOUBLE,
    premium_amount_withtax       DOUBLE,
    exposure                     DOUBLE,
    exposure_detail              DOUBLE,
    gep                          DOUBLE,
    nep                          DOUBLE,
    ee                           DOUBLE,
    ee_detail                    DOUBLE,
    gep_withtax                  DOUBLE,
    actuarial_usage_code         VARCHAR(50)
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

//3.导入hive数据
-- 3. Load the local CSV into the paid-case table.
--    OVERWRITE replaces any existing table data (it does not append).
--    Fix: the path was wrapped in curly Unicode quotes (‘…’), which the
--    parser rejects; replaced with ASCII single quotes.
LOAD DATA LOCAL INPATH '/room/data/haha.csv' OVERWRITE INTO TABLE auto_paid_case_tmp_01;
(说明:load data local inpath '文件路径' overwrite into table 表名,其中 overwrite 是覆盖,不是追加!)

//加上下面这两句可以把查询结果导出为本地文件,即以","分隔的 csv 形式。
//insert overwrite local directory "/tmp/out/"
//row format delimited fields terminated by ","

//4.这是创建查询结果表正确格式
-- 4. Build the result table: every premium base row, enriched with the
--    paid/claim measures from auto_paid_case_tmp_01 (LEFT JOIN, so base
--    rows without claims are kept).
--    Each claim measure is attributed only to the row whose
--    inception_month is the earliest within its
--    (branch, policy, product detail, inquire month) group — detected
--    with FIRST_VALUE over that window; every other row in the group
--    gets 0, so the measure is not double-counted when aggregated later.
CREATE TABLE query_result AS
SELECT
    b.branch_company_code,
    b.policy_no,
    b.department_group_code,
    b.department_code,
    b.product_code,
    b.product_code_detail,
    b.inception_month,
    b.inquire_month,
    b.selling_channel_type,
    b.policy_type,
    b.policy_status,
    b.premium_amount,
    b.premium_amount_static,
    b.premium_amount_retain,
    b.premium_amount_retain_static,
    b.premium_amount_withtax,
    b.exposure,
    b.exposure_detail,
    b.gep,
    b.nep,
    b.ee,
    b.ee_detail,
    b.party_type,
    b.product_type,
    b.estimate_product_type,
    b.business_require_system2,

    -- The 14 measures below all share the same pattern: keep the value
    -- only on the earliest-inception row of the window, else 0.
    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_amount ELSE 0
    END AS paid_amount,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_amount_retain ELSE 0
    END AS paid_amount_retain,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_insured ELSE 0
    END AS paid_insured,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_alae ELSE 0
    END AS paid_alae,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_num_noncancel ELSE 0
    END AS paid_num_noncancel,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_num_zero ELSE 0
    END AS paid_num_zero,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_num_cancel ELSE 0
    END AS paid_num_cancel,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_num_noncancel_detail ELSE 0
    END AS paid_num_noncancel_detail,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_num_zero_detail ELSE 0
    END AS paid_num_zero_detail,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.paid_num_cancel_detail ELSE 0
    END AS paid_num_cancel_detail,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.estimate_amount ELSE 0
    END AS estimate_amount,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.estimate_amount_retain ELSE 0
    END AS estimate_amount_retain,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.os_num ELSE 0
    END AS os_num,

    CASE WHEN FIRST_VALUE(b.inception_month) OVER (
                  PARTITION BY b.branch_company_code, b.policy_no,
                               b.product_code_detail, b.inquire_month
                  ORDER BY b.inception_month
              ) = b.inception_month
         THEN p.os_num_detail ELSE 0
    END AS os_num_detail,

    b.gep_withtax,
    b.actuarial_usage_code
FROM auto_dl1_base_premium_tmp01 b
LEFT JOIN auto_paid_case_tmp_01 p
    ON  b.branch_company_code  = p.branch_company_code
    AND b.product_code_detail  = p.product_code_detail
    AND b.policy_no            = p.policy_no
    AND b.inquire_month        = p.accident_month;

结果:
用时Time taken: 43.05 seconds
17/08/29 19:29:25 INFO CliDriver: Time taken: 43.05 seconds

//5.查询结果语句sql
//
-- 5. Yearly totals: the first 4 characters of inquire_month (yyyymm)
--    give the year.
--    Fixes: output columns now carry explicit names instead of
--    auto-generated expression labels, and ORDER BY makes the row order
--    deterministic (the original output printed years in arbitrary
--    order: 2016, 2017, 2014, 2013, 2015).
SELECT
    substr(qr.inquire_month, 1, 4)       AS inquire_year,
    sum(qr.premium_amount_static)        AS premium_amount_static,
    sum(qr.premium_amount_retain_static) AS premium_amount_retain_static,
    sum(qr.paid_amount)                  AS paid_amount,
    sum(qr.paid_amount_retain)           AS paid_amount_retain,
    sum(qr.estimate_amount)              AS estimate_amount,
    sum(qr.estimate_amount_retain)       AS estimate_amount_retain
FROM query_result qr
GROUP BY substr(qr.inquire_month, 1, 4)
ORDER BY inquire_year;

//结果如下:
2016 5.666219326377354E7 5.293598994840718E7 2.481366084999998E7 2.2746898331373088E7 1128048.84 1054172.1800000002
2017 3.9573576133207515E7 3.624134659292713E7 7865497.719999997 7087857.122956341 5014556.81 4547917.7
2014 2.0922988189999998E7 1.956592757000001E7 7445313.239999999 6525843.269999999 0.0 0.0
2013 1385740.8700000003 1014179.1199999993 99992.49 48603.060000000005 0.0 0.0
2015 4.0925074010000005E7 3.812092436400004E7 1.756812285E7 1.6017133919999989E7 37612.8 37612.8
Time taken: 4.11 seconds, Fetched 5 row(s)
17/08/30 11:31:10 INFO CliDriver: Time taken: 4.11 seconds, Fetched 5 row(s)

ps此次使用如下内存参数查询的指令:
./spark-sql --master yarn --driver-memory 4G --executor-memory 3G --num-executors 4

经过多次调优,现有机器条件下(三台内存,11G,8G,8G)下以及3T硬盘空间,能够达到的最好效果,数据源于保险行业真实数据,不方便透露,260万条数据与64万条数据的关联产生结果表,速度还是非常不错的。
结果

### 配置 Windows 环境下的 Spark SQL

#### 1. 安装 Java 和设置环境变量
为了使 Spark 正常工作,Java 是必需的。下载并安装最新版本的 JDK,并配置 `JAVA_HOME` 环境变量指向 JDK 的安装路径。

#### 2. 下载和解压 Spark
从 Apache Spark 官方网站下载适用于相应 Hadoop 版本的预编译二进制文件。将其解压缩至所需位置,例如 `C:\spark`。

#### 3. 设置 SPARK_HOME 及 PATH 环境变量
创建一个新的系统环境变量名为 `SPARK_HOME`,并设其值为 Spark 解压后的根目录;接着编辑系统的 `PATH` 变量,追加 `%SPARK_HOME%\bin;%SPARK_HOME%\sbin` 到现有值之后。

#### 4. 测试 Spark Shell 是否正常启动
打开命令提示符窗口(最好以管理员权限),输入 `spark-shell` 命令来验证是否能顺利进入 Scala REPL 接口。如果一切无误,则说明基本配置已完成。

#### 5. 连接 Hive Metastore
为了让 Spark 能够访问存储于 Hive 中的数据表结构定义及其他元数据信息,需确保 Hive Metastore 处于运行状态。这可通过启动 Hive Server,或者通过其他方式保证 metastore service 正在监听默认端口 9083 上的请求。

#### 6. 使用 Spark SQL 执行查询
一旦上述准备工作完成,就可以利用 PySpark、Scala 或者 R 编写应用程序来进行数据分析处理。下面是一个简单的 Python 示例,展示如何读取 JDBC 数据源中的表格并通过 Spark SQL 对其操作:

```python
from pyspark.sql import SparkSession

# 创建 SparkSession 实例
spark = SparkSession.builder \
    .appName("example") \
    .config("spark.sql.warehouse.dir", "file:///D:/tmp/spark-warehouse") \
    .enableHiveSupport() \
    .getOrCreate()

# 构建数据库连接参数字典
dbConf = {
    'url': 'jdbc:oracle:thin:@localhost:1521/orcl',
    'driver': 'oracle.jdbc.driver.OracleDriver',
}

sql_query = "(select * from mtl_system_items_b where rownum < 10) as temp"
dbConf['dbtable'] = sql_query

df = spark.read.format('jdbc').options(**dbConf).load()
df.createOrReplaceTempView("boms")

print("显示结果:")
spark.sql("SELECT * FROM boms").show()
```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值