这几天在看Supersonic的代码,下面是我根据其提供的hash join示例改写的一个测试示例,实现了两个表的Hash Join。
编译:
g++ -I/home/kernel0017/supersonic/supersonic/ -I../glog/src/ -I../gflags/src/ -I../protobuf/src/ -I../re2 -I /home/kernel0017/supersonic/supersonic/gmock/gtest/include -L/usr/local/lib -lsupersonic -lgflags -lglog -lprotobuf -lre2 -lboost_timer -Wno-deprecated -g -o testhashjoin testhashjoin.cc
运行:./testhashjoin。
/home/kernel0017/supersonic/supersonic/是supersonic的头文件目录,../glog/src是glog的头文件目录,../gflags/src是gflags的头文件目录,../protobuf/src是protobuf的头文件目录,../re2是re2的头文件目录,/home/kernel0017/supersonic/supersonic/gmock/gtest/include是Supersonic源码中使用的gtest头文件目录,/usr/local/lib是所有用到的库文件的安装目录。
#include <map>
using std::map;using std::multimap;
#include <set>
using std::multiset;
using std::set;
#include <utility>
using std::make_pair;
using std::pair;
#include "gtest/gtest.h"
#include "supersonic/supersonic.h"
#include "supersonic/cursor/core/sort.h"
#include "supersonic/cursor/infrastructure/ordering.h"
#include "supersonic/utils/strings/stringpiece.h"
// Include some map utilities to use for result verification.
#include "supersonic/utils/map-util.h"
using supersonic::Attribute;
using supersonic::Block;
using supersonic::Cursor;
using supersonic::Operation;
using supersonic::FailureOr;
using supersonic::FailureOrOwned;
using supersonic::GetConstantExpressionValue;
using supersonic::TupleSchema;
using supersonic::Table;
using supersonic::TableRowWriter;
using supersonic::View;
using supersonic::ViewCopier;
using supersonic::HashJoinOperation;
using supersonic::HeapBufferAllocator;
using supersonic::JoinType;
using supersonic::ProjectNamedAttribute;
using supersonic::ProjectNamedAttributeAs;
using supersonic::rowid_t;
using supersonic::SingleSourceProjector;
using supersonic::MultiSourceProjector;
using supersonic::CompoundSingleSourceProjector;
using supersonic::CompoundMultiSourceProjector;
using supersonic::ResultView;
using supersonic::ScanView;
using supersonic::SucceedOrDie;
using supersonic::If;
using supersonic::IfNull;
using supersonic::Less;
using supersonic::CompoundExpression;
using supersonic::Expression;
using supersonic::Compute;
using supersonic::Generate;
using supersonic::ParseStringNulling;
using supersonic::ConstBool;
using supersonic::ConstString;
using supersonic::ConstInt32;
using supersonic::Null;
using supersonic::INNER;
using supersonic::UNIQUE;
using supersonic::INT32;
using supersonic::NOT_NULLABLE;
using supersonic::NULLABLE;
using supersonic::STRING;
using supersonic::DATE;
using supersonic::BOOL;
using supersonic::rowcount_t;
//本例实现一个Hash join。
//本测试也是用存在内存中的表,以便在表里插入行。我们也将用sinks来将计算数据传入tables里面。
class HashJoinTest {
public:
void SetUp() {
//author表和book表做hash join.简单起见,一本书只有一个作者。
author_schema.add_attribute(Attribute("author_id", INT32, NOT_NULLABLE));
author_schema.add_attribute(Attribute("name", STRING, NOT_NULLABLE));
author_schema.add_attribute(Attribute("nobel", BOOL, NOT_NULLABLE));
//supersonic提供了两种时间数据类型:DATE和DATETIME。DATE以天为单位,DATETIME以ms为单位。
//DATE和DATETIME对象分别是以32位和64位的整型存储的。
//我们也会处理空值,一个列为空,用Attribute(),如下date_published,值可为空的意思。
book_schema.add_attribute(Attribute("book_id", INT32, NOT_NULLABLE));
book_schema.add_attribute(Attribute("author_id_ref", INT32, NULLABLE));
book_schema.add_attribute(Attribute("title", STRING, NOT_NULLABLE));
book_schema.add_attribute(Attribute("date_published",
DATE,
NULLABLE));
//首先我们先创建tables;
author_table.reset(new Table(author_schema,
HeapBufferAllocator::Get()));
book_table.reset(new Table(book_schema,
HeapBufferAllocator::Get()));
//两种方法写入数据到tables中:
//1、TableRowWriter 比较适合于简单的测试环境。
//2、直接写入table。
author_table_writer.reset(new TableRowWriter(author_table.get()));
//输入计数器来产生ID。
author_count = 0;
book_count = 0;
}
void PrepareJoin() {
//
//在开始实现join之前,我们必须先考虑left hand side(lhs) 和 right hand side(rhs)的问题。
//在Supersonic right hand side本作为index,它应该是相对小的表。来自lhs cursor的数据
//以流的形式和index进行匹配。在本例中,authors明显要比books少,所以book表作为lhs,author表作为rhs.
//Supersonic可以为index开启一些特定的优化,我们不久就会涉及。
//我们现在为两个表准备single source projectors(key selectors)。
scoped_ptr<const SingleSourceProjector> book_selector(
ProjectNamedAttribute("author_id_ref"));
scoped_ptr<const SingleSourceProjector> author_selector(
ProjectNamedAttribute("author_id"));
//我们用一个mutisource projector来表示我们想要得到的结果,因此我们用CompoundMultiSourceProjector
//Supersonic将把两个schema与其绑定。我们现在指定哪些列要投影到result里面,并且要消除重复,
//我们可以对于重复的列想个新名字,也可以直接舍弃其中一些。
//
scoped_ptr<CompoundMultiSourceProjector> result_projector(
new CompoundMultiSourceProjector());
//add()函数用于multi source projector,不像single source需要两个参数,source index和single source projector
//我们现在需要指定哪些属性要投影。我们可以走捷径用ProjectAllAtributes,但是它对于我们要做join的两列产生不好的影响。
//
scoped_ptr<CompoundSingleSourceProjector> result_book_projector(
new CompoundSingleSourceProjector());
result_book_projector->add(ProjectNamedAttribute("title"));
result_book_projector->add(ProjectNamedAttribute("date_published"));
result_book_projector->add(ProjectNamedAttribute("book_id"));
scoped_ptr<CompoundSingleSourceProjector> result_author_projector(
new CompoundSingleSourceProjector());
result_author_projector->add(
ProjectNamedAttributeAs("name", "author_name"));
result_author_projector->add(ProjectNamedAttribute("nobel"));
result_author_projector->add(ProjectNamedAttribute("author_id"));
//将single source projector的内容写入result_projector中
result_projector->add(0, result_book_projector.release());
result_projector->add(1, result_author_projector.release());
//首先,我们要决定我们要进行什么类型的Join,目前Supersonic只支持两种:内连接和左外连接。
//为了将不知道作者的书的条目排除,此处用的是内连接
//其次,supersonic也要求我们检查rhs schema的数据,所有的关键字是不是唯一的。如果我们事先知道这个信息,
//我们就可以开启hash join优化。如果有重复值,或者我们不能确定是否有重复值,我们要用NOT_UNIQUE选项。
//在这个案例中,对于rhs index我们可以启用优化。
//现在我们来创建一个Operation
scoped_ptr<Operation> hash_join(
new HashJoinOperation(/* join type */ INNER,
/* select left */ book_selector.release(),
/* select right */ author_selector.release(),
/* project result */ result_projector.release(),
/* unique keys on the right ? */ UNIQUE,
/* left data */ ScanView(book_table->view()),
/* right data */ ScanView(author_table->view())));
result_cursor.reset(SucceedOrDie(hash_join->CreateCursor()));
}
//添加作者的方法会创造一个是否获过诺贝尔奖的条目。返回author_id用以关联books和authors.
int32 AddAuthor(const StringPiece& name, bool nobel) {
int32 author_id = author_count++;
// 在table中写数据时一定要注意字段的顺序。
author_table_writer
->AddRow().Int32(author_id).String(name).Bool(nobel).CheckSuccess();
return author_id;
}
//我们用直接的方法在book table里面写入数据。在这儿我们也增加了对于Null值的支持。
int32 AddBook(const StringPiece& title,
const StringPiece& date_published,
int32 author_id) {
int32 book_id = book_count++;
CHECK_EQ(book_id, book_table->row_count());
rowid_t row_id = book_table->AddRow();
// setting Attribute("book_id", INT32, NOT_NULLABLE).
book_table->Set<INT32>(0, row_id, book_id);
// setting Attribute("author_id_ref", INT32, NULLABLE).
if (author_id >= 0) {
book_table->Set<INT32>(1, row_id, author_id);
} else {
book_table->SetNull(1, row_id);
}
// setting Attribute("title", STRING, NOT_NULLABLE).
// This makes a deep copy of the StringPiece.
book_table->Set<STRING>(2, row_id, title);
// setting Attribute("date_published", DATE, NULLABLE).
//DATEs 内部表示是32位整型,我们用32位整型表示这个值。另一个方法就是存string到table里面来表示
//在我们调用之前用ParseStringNulling转换
//ParseStringNulling可以将string expression转换为date object。空输入或者无效输入将创建一个null entry
//DATETIME有一个DATE没有的捷径,即用ConstDateTime方法直接从StringPieces创建对象
scoped_ptr<const Expression> date_or_null(
ParseStringNulling(DATE, ConstString(date_published)));
bool date_published_is_null = false;
FailureOr<int32> data_published_as_int32 =
GetConstantExpressionValue<DATE>(*date_or_null,
&date_published_is_null);
CHECK(data_published_as_int32.is_success())
<< data_published_as_int32.exception().ToString();
if (!date_published_is_null) {
book_table->Set<DATE>(3, row_id, data_published_as_int32.get());
} else {
book_table->SetNull(3, row_id);
}
return book_id;
}
//将author names和book titles用ids(authors)和author reference ids(books)映射起来
typedef map<int32, StringPiece> author_name_map;
typedef multimap<int32, StringPiece> book_title_map;
// Utilities for storing pairs of (name, title).
typedef pair<StringPiece, StringPiece> author_book_entry;
typedef set<author_book_entry> author_book_set;
void TestResults() {
//检查结果是否满足需求,首先,我们必须把轮询rows,将它们放到一个内存块里。
scoped_ptr<Block> result_space(new Block(result_cursor->schema(),
HeapBufferAllocator::Get()));
ViewCopier copier(result_cursor->schema(), /* deep copy */ true);
rowcount_t offset = 0;
scoped_ptr<ResultView> rv(new ResultView(result_cursor->Next(-1)));
//!rv->is_done()的意思是游标既没有读完而且也没有发生错误的情况下,执行循环体。
while (!rv->is_done()) {
const View& view = rv->view();
rowcount_t view_row_count = view.row_count();
//为新值分配block,我们事先不知道需要多少个。
result_space->Reallocate(offset + view_row_count);
rowcount_t rows_copied = copier.Copy(view_row_count,
view,
offset,
result_space.get());
offset += rows_copied;
rv.reset(new ResultView(result_cursor->Next(-1)));
}
const View& result_view(result_space->view());
//输出生成的结果表。
for (int32 k=0;k<result_view.column_count();k++){
std::cout<<result_view.schema().attribute(k).name()<<"\t";
}
std::cout<<std::endl;
for(int j=0; j<result_view.row_count();j++)
{
std::cout<<result_view.column(0).typed_data<STRING>()[j]<<"\t";
std::cout<<result_view.column(1).typed_data<DATE>()[j]<<"\t";
std::cout<<result_view.column(2).typed_data<INT32>()[j]<<"\t";
std::cout<<result_view.column(3).typed_data<STRING>()[j]<<"\t";
std::cout<<result_view.column(4).typed_data<BOOL>()[j]<<"\t";
std::cout<<result_view.column(5).typed_data<INT32>()[j]<<"\t";
std::cout<<std::endl;
}
}
// Supersonic objects.
scoped_ptr<Cursor> result_cursor;
TupleSchema author_schema;
TupleSchema book_schema;
scoped_ptr<Table> author_table;
scoped_ptr<TableRowWriter> author_table_writer;
scoped_ptr<Table> book_table;
// Sequence counters.
int32 author_count;
int32 book_count;
};
int main(void) {
// DISCLAIMER: The values below should by no means be used as a reliable
// information source, especially the publishing dates are not accurate,
// although the years should match reality... :)
HashJoinTest test;
test.SetUp();
int32 terry_id = test.AddAuthor("Terry Pratchett", false);
int32 chuck_id = test.AddAuthor("Chuck Palahniuk", false);
int32 ernest_id = test.AddAuthor("Ernest Hemingway", true);
// Again, in a production environment one would use a simpler INT32 field
// if they didn't care about full dates, but we are excused by demonstration
// purposes.
test.AddBook("The Reaper Man", "1991/01/01", terry_id);
test.AddBook("Colour of Magic", "1983/01/01", terry_id);
test.AddBook("Light Fantastic", "1986/01/01", terry_id);
test.AddBook("Mort", NULL, terry_id);
test.AddBook("Fight Club", "1996/01/01", chuck_id);
test.AddBook("Survivor", NULL, chuck_id);
test.AddBook("Choke", "2001/01/01", chuck_id);
test.AddBook("The old man and the sea", NULL, ernest_id);
test.AddBook("For whom the bell tolls", NULL, ernest_id);
test.AddBook("A farewell to arms", "1929/01/01", ernest_id);
test.AddBook("Carpet People", NULL, -1);
test.AddBook("Producing open source software.", NULL, -1);
test.AddBook("Quantum computation and quantum information.", NULL, -1);
test.PrepareJoin();
test.TestResults();
return 0;
}