Kylo 之 spark-job-profiler 源码阅读

https://github.com/Teradata/kylo/tree/master/integrations/spark/spark-job-profiler

A Spark job capable of generating profile statistics against a source table, a partition, or a provided query.

流程

  • Data is typically read from a source table such as -valid and a given partition.
  • Profile statistics are generated.
  • Profile statistics are written to the -profile table.

加载数据

    @Inject
    private SparkContextService scs;

    @Inject
    private SQLContext sqlContext;
    
    DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));

生成 Profile statistics

    ProfilerConfiguration configuration = new ProfilerConfiguration();
    
    StatisticsModel statsModel = profiler.profile(dataDF, configuration);
package com.thinkbiganalytics.spark.dataprofiler;

/*-
 * #%L
 * kylo-spark-job-profiler-api
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.Serializable;

/**
 * Helper class to hold parameters for profiler
 */
@SuppressWarnings("unused")
public class ProfilerConfiguration implements Serializable {

    // Serialization version identifier for this Serializable class.
    private static final long serialVersionUID = -6099960489540200374L;

    // Configuration properties with their initial (default) values.
    // Number of decimal digits shown in console output.
    private Integer decimalDigitsToDisplayConsoleOutput = 4;
    // Partition key used for both the input and the output table.
    private String inputAndOutputTablePartitionKey = "partitionKey";
    // Partition column name of the input table.
    private String inputTablePartitionColumnName = "processing_dttm";
    // N for the top-N values stored in the result table.
    private Integer numberOfTopNValues = 5;
    // Database the result is written to.
    private String outputDbName = "default";
    // Table the result is written to.
    private String outputTableName = "profilestats";
    // Partition column name of the output table.
    private String outputTablePartitionColumnName = "processing_dttm";
    private String sqlDialect = "hiveql";  // Hive supported HQL
    // Number of histogram bins to generate.
    private Integer bins = 5;

    /**
     * Returns how many decimal digits are printed in console output.<br>
     * This setting is not considered when writing to the table.
     *
     * @return the configured number of decimal digits for console output
     */
    public Integer getDecimalDigitsToDisplayConsoleOutput() {
        return this.decimalDigitsToDisplayConsoleOutput;
    }

    /**
     * Sets the number of decimal digits to print in console output.
     *
     * @param decimalDigitsToDisplayConsoleOutput the new number of decimal digits
     */
    public void setDecimalDigitsToDisplayConsoleOutput(Integer decimalDigitsToDisplayConsoleOutput) {
        this.decimalDigitsToDisplayConsoleOutput = decimalDigitsToDisplayConsoleOutput;
    }

    /**
     * Returns the partition key that is read from and written to.
     *
     * @return the partition key shared by the input and output tables
     */
    public String getInputAndOutputTablePartitionKey() {
        return this.inputAndOutputTablePartitionKey;
    }

    /**
     * Sets the partition key to read from and write to.
     *
     * @param inputAndOutputTablePartitionKey the new partition key
     */
    public void setInputAndOutputTablePartitionKey(String inputAndOutputTablePartitionKey) {
        this.inputAndOutputTablePartitionKey = inputAndOutputTablePartitionKey;
    }

    /**
     * Returns the name of the partition column of the input table.
     *
     * @return the input table's partition column name
     */
    public String getInputTablePartitionColumnName() {
        return this.inputTablePartitionColumnName;
    }

    /**
     * Sets the partition column name for the input table.
     *
     * @param inputTablePartitionColumnName the new partition column name
     */
    public void setInputTablePartitionColumnName(String inputTablePartitionColumnName) {
        this.inputTablePartitionColumnName = inputTablePartitionColumnName;
    }

    /**
     * Returns N, the count of top-N values to store in the result table.<br>
     * A required command line parameter
     *
     * @return the number of top values to keep
     */
    public Integer getNumberOfTopNValues() {
        return this.numberOfTopNValues;
    }

    /**
     * Sets N for the top-N values to store in the result table.
     *
     * @param numberOfTopNValues the new value of N
     */
    public void setNumberOfTopNValues(Integer numberOfTopNValues) {
        this.numberOfTopNValues = numberOfTopNValues;
    }

    /**
     * Returns the name of the database the result is written to.
     *
     * @return the output database name
     */
    public String getOutputDbName() {
        return this.outputDbName;
    }

    /**
     * Sets the name of the database to write the result to.
     *
     * @param outputDbName the new output database name
     */
    public void setOutputDbName(String outputDbName) {
        this.outputDbName = outputDbName;
    }

    /**
     * Returns the name of the table the result is written to.<br>
     * A required command line parameter
     *
     * @return the output table name
     */
    public String getOutputTableName() {
        return this.outputTableName;
    }

    /**
     * Sets the name of the table to write the result to.
     *
     * @param outputTableName the new output table name
     */
    public void setOutputTableName(String outputTableName) {
        this.outputTableName = outputTableName;
    }

    /**
     * Returns the name of the partition column of the output table.
     *
     * @return the output table's partition column name
     */
    public String getOutputTablePartitionColumnName() {
        return this.outputTablePartitionColumnName;
    }

    /**
     * Sets the partition column name for the output table.
     *
     * @param outputTablePartitionColumnName the new partition column name
     */
    public void setOutputTablePartitionColumnName(String outputTablePartitionColumnName) {
        this.outputTablePartitionColumnName = outputTablePartitionColumnName;
    }

    /**
     * Returns the flavor of queries to run.
     *
     * @return the configured SQL dialect
     */
    public String getSqlDialect() {
        return this.sqlDialect;
    }

    /**
     * Sets the flavor of queries to run.
     *
     * @param sqlDialect the new SQL dialect
     */
    public void setSqlDialect(String sqlDialect) {
        this.sqlDialect = sqlDialect;
    }

    /**
     * Gets the number of histogram bins to generate.
     *
     * @return the configured number of bins
     */
    public Integer getBins() {
        return this.bins;
    }

    /**
     * Set the number of histogram bins to generate
     * @param bins the number 
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值