数据的查询和展示是superset的核心功能,前端用D3.js来渲染各种图表,后端用pandas来处理各种数据。
superset的数据查询功能也很简单,如果懂sql的话,只要在superset的explore页面配置一下左侧的图表类型,查询条件,例如,过滤字段,聚合查询/非聚合查询的字段,查询时间等,然后点页面上方的查询按钮,页面右侧就会按你选择的图表和查询条件来展示查询结果。今天我们来分析一下superset是如何查询数据并且把数据展示出来的。
superset查询数据的原理:
1.将前端的配置信息form_data传给explore_json函数
2.根据所选的图表类型,找到对应的图表类
3.根据过滤条件生成sql查询语句
4.根据数据库的连接条件找到对应的数据库engine,创建engine
5.使用pandas的read_sql函数获取查询结果,并生成一个dataframe
具体看一下代码:
- superset/views/core.py Superset类的explore_json
@log_this
@has_access_api
@expose('/explore_json/<datasource_type>/<datasource_id>/', methods=['GET', 'POST'])
@expose('/explore_json/', methods=['GET', 'POST'])
def explore_json(self, datasource_type=None, datasource_id=None):
    """Serve chart data for the explore view.

    Parses export/behavior flags from the query string, resolves the
    datasource from the submitted form_data, then delegates the actual
    query execution and serialization to ``generate_json``.
    """
    try:
        # Boolean flags arrive as the literal string 'true' in the
        # query string; anything else (including absence) means False.
        flags = {
            name: request.args.get(name) == 'true'
            for name in ('csv', 'query', 'xlsx', 'force')
        }
        form_data = self.get_form_data()[0]
        # form_data may carry the datasource when it is absent from the URL.
        datasource_id, datasource_type = self.datasource_info(
            datasource_id, datasource_type, form_data)
    except Exception as e:
        logging.exception(e)
        return json_error_response(
            utils.error_msg_from_exception(e),
            stacktrace=traceback.format_exc())
    return self.generate_json(
        datasource_type=datasource_type,
        datasource_id=datasource_id,
        form_data=form_data,
        csv=flags['csv'],
        query=flags['query'],
        force=flags['force'],
        xlsx=flags['xlsx'])
- superset/viz.py BaseViz的get_df
def get_df(self, query_obj=None):
    """Returns a pandas dataframe based on the query object"""
    # Build the default query object from the viz configuration when the
    # caller did not supply one; bail out if none can be produced.
    if not query_obj:
        query_obj = self.query_obj()
    if not query_obj:
        return None

    self.error_msg = ''
    self.results = None

    # For SQL-backed tables, look up the time column so that its configured
    # Python date format (if any) can be applied to the result.
    timestamp_format = None
    if self.datasource.type == 'table':
        dttm_col = self.datasource.get_col(query_obj['granularity'])
        if dttm_col:
            timestamp_format = dttm_col.python_date_format

    # The datasource here can be different backend but the interface is common
    self.results = self.datasource.query(query_obj, self.viz_type)  # SqlaTable.query method
    self.query = self.results.query
    self.status = self.results.status
    self.error_message = self.results.error_message
    # NOTE(review): the remainder of this method (dataframe post-processing,
    # use of timestamp_format) is truncated in this excerpt.
    pass
SqlaTable.query:
def query(self, query_obj, viz_type=None, special_sql=None):
    """Execute the query described by ``query_obj`` against this table's
    database and capture the resulting dataframe plus status/error info.
    """
    # Helper that converts values of the synthetic '__timestamp' index to the
    # selected SQLAlchemy time grain. NOTE(review): not invoked within this
    # excerpt — presumably used by the truncated remainder of the method.
    def format_time_grain(index, time_grain_sqla=None):
        if index.name == '__timestamp':
            index = index.apply(lambda x: time_grain_convert(x, time_grain_sqla))
        return index

    qry_start_dttm = datetime.now()  # start time, for measuring query duration
    status = QueryStatus.SUCCESS
    error_message = None
    df = None
    # A caller-supplied SQL string bypasses query-object -> SQL generation.
    if special_sql:
        sql = special_sql
    else:
        time_grain_sqla = query_obj["extras"].get('time_grain_sqla',None)
        sql = self.get_query_str(query_obj)
    try:
        df = self.database.get_df(sql, self.schema)  # Database.get_df method
    except Exception as e:
        status = QueryStatus.FAILED
        logging.exception(e)
        error_message = (
            self.database.db_engine_spec.extract_error_message(e))
    # NOTE(review): the remainder of this method (building the query result
    # object from df/status/error_message) is truncated in this excerpt.
    pass
Database.get_df:
def get_df(self, sql, schema):
    """Run ``sql`` against this database and return a pandas DataFrame.

    :param sql: SQL statement to execute; surrounding whitespace and any
        trailing semicolons are stripped, since some drivers reject them.
    :param schema: schema name forwarded to ``get_sqla_engine``.
    :returns: the query result as a ``pandas.DataFrame``, with list/dict
        cells serialized to JSON strings.
    """
    sql = sql.strip().strip(';')
    eng = self.get_sqla_engine(schema=schema)  # create the engine
    df = pd.read_sql(sql, eng)  # fetch the query result

    def needs_conversion(df_series):
        # Object-dtype columns holding lists/dicts (e.g. JSON-typed
        # columns) must be serialized so the frontend can render them.
        if df_series.empty:
            return False
        # Use positional access: label-based df_series[0] raises KeyError
        # when the index does not contain the label 0.
        if isinstance(df_series.iloc[0], (list, dict)):
            return True
        return False

    for k, v in df.dtypes.items():
        if v.type == numpy.object_ and needs_conversion(df[k]):
            df[k] = df[k].apply(utils.json_dumps_w_dates)
    return df