Greenplum 6.X 资源管理源码分析

本文适用于Greenplum Database(GPDB)源码 6X_STABLE 分支。

一、源码路径

  • src\backend\utils\resgroup
  • src\backend\utils\resource_manager
  • src\backend\utils\resowner
  • src\backend\utils\resscheduler

二、内存管理

初始化共享内存:

#2  0x0000000000ddf80d in ResManagerShmemInit () at resource_manager.c:45
#3  0x0000000000be253d in CreateSharedMemoryAndSemaphores (port=6407) at ipci.c:284
#4  0x0000000000b8ce01 in reset_shared (port=6407) at postmaster.c:2915
#5  0x0000000000b8a825 in PostmasterMain (argc=5, argv=0x2f3c390) at postmaster.c:1331
#6  0x0000000000a93fc2 in main (argc=5, argv=0x2f3c390) at main.c:249

创建内存Context,供palloc分配内存时使用:

MemoryContext
AllocSetContextCreate(MemoryContext parent,
					  const char *name,
					  Size minContextSize,
					  Size initBlockSize,
					  Size maxBlockSize);

通过切换上下文方式,切换当前内存Context,例如,

old_ctx = MemoryContextSwitchTo(new_ctx);
…… do something
MemoryContextSwitchTo(old_ctx);

当调用palloc时,

void *
palloc(Size size)
{
	……
  // CurrentMemoryContext->methods.alloc函数指针指向分配内存处理函数。
	ret = (*CurrentMemoryContext->methods.alloc) (CurrentMemoryContext, size);
	……
	return ret;
}

三、资源组

创建cgroups层级

src\backend\utils\resgroup\resgroup-ops-linux.c:buildPathSafe

数据结构与接口

  • src\include\utils\resgroup.h,资源组基本数据结构与接口定义

资源组创建、删除和修改处理函数

extern void CreateResourceGroup(CreateResourceGroupStmt *stmt);
extern void DropResourceGroup(DropResourceGroupStmt *stmt);
extern void AlterResourceGroup(AlterResourceGroupStmt *stmt);

关于资源类型的枚举

src\include\catalog\pg_resgroup.h:
/*
 * Kinds of limits that can be attached to a resource group.
 *
 * RESGROUP_LIMIT_TYPE_UNKNOWN is pinned to 0 and the remaining enumerators
 * follow consecutively.  RESGROUP_LIMIT_TYPE_COUNT is last, so its value
 * equals the number of enumerators before it (UNKNOWN included).
 */
typedef enum ResGroupLimitType
{
	RESGROUP_LIMIT_TYPE_UNKNOWN = 0,

	RESGROUP_LIMIT_TYPE_CONCURRENCY,
	RESGROUP_LIMIT_TYPE_CPU,
	RESGROUP_LIMIT_TYPE_MEMORY,
	RESGROUP_LIMIT_TYPE_MEMORY_SHARED_QUOTA,
	RESGROUP_LIMIT_TYPE_MEMORY_SPILL_RATIO,
	RESGROUP_LIMIT_TYPE_MEMORY_AUDITOR,
	RESGROUP_LIMIT_TYPE_CPUSET,

	/* Not a limit type itself: counts the enumerators above. */
	RESGROUP_LIMIT_TYPE_COUNT,
} ResGroupLimitType;

创建资源组时的堆栈

#0  CreateResourceGroup (stmt=stmt@entry=0x26723e8) at resgroupcmds.c:103
#1  0x0000000000a7e7bb in standard_ProcessUtility (parsetree=0x26723e8, 
    queryString=0x2670e08 "CREATE RESOURCE GROUP rgroup1 WITH (CPU_RATE_LIMIT=5, MEMORY_LIMIT=25, MEMORY_SPILL_RATIO=20);", context=PROCESS_UTILITY_TOPLEVEL, 
    params=0x0, dest=0x2672728, completionTag=0x7ffd9572a250 "") at utility.c:863
#2  0x0000000000a7b8b5 in PortalRunUtility (portal=portal@entry=0x287cc58, utilityStmt=utilityStmt@entry=0x26723e8, isTopLevel=isTopLevel@entry=1 '\001', 
    dest=dest@entry=0x2672728, completionTag=completionTag@entry=0x7ffd9572a250 "") at pquery.c:1381
#3  0x0000000000a7c2b5 in PortalRunMulti (portal=portal@entry=0x287cc58, isTopLevel=isTopLevel@entry=1 '\001', dest=dest@entry=0x2672728, 
    altdest=altdest@entry=0x2672728, completionTag=completionTag@entry=0x7ffd9572a250 "") at pquery.c:1512
#4  0x0000000000a7d711 in PortalRun (portal=portal@entry=0x287cc58, count=count@entry=9223372036854775807, isTopLevel=isTopLevel@entry=1 '\001', 
    dest=dest@entry=0x2672728, altdest=altdest@entry=0x2672728, completionTag=completionTag@entry=0x7ffd9572a250 "") at pquery.c:1018
#5  0x0000000000a78514 in exec_simple_query (query_string=0x2670e08 "CREATE RESOURCE GROUP rgroup1 WITH (CPU_RATE_LIMIT=5, MEMORY_LIMIT=25, MEMORY_SPILL_RATIO=20);")
    at postgres.c:1824
#6  0x0000000000a7b470 in PostgresMain (argc=<optimized out>, argv=argv@entry=0x2650788, dbname=0x2650640 "tpch1s", username=<optimized out>) at postgres.c:5246
#7  0x00000000006bf09b in BackendRun (port=0x2680890) at postmaster.c:4811
#8  BackendStartup (port=0x2680890) at postmaster.c:4468
#9  ServerLoop () at postmaster.c:1948
#10 0x0000000000a025c9 in PostmasterMain (argc=argc@entry=6, argv=argv@entry=0x264e810) at postmaster.c:1518
#11 0x00000000006c3e4b in main (argc=6, argv=0x264e810) at main.c:245

查询开始时,资源登记

#0  ResLockUpdateLimit (increment=1 '\001', proclock=0x7fb305aa29b0, inError=0 '\000', incrementSet=0x7fb308a6cf68, lock=0x7fb305517130) at resqueue.c:834
#1  ResLockAcquire (locktag=locktag@entry=0x7ffebec95270, incrementSet=<optimized out>,	incrementSet@entry=0x7ffebec95280) at resqueue.c:406
#2  0x0000000000c22a08 in ResLockPortal (portal=portal@entry=0x3bc5148,	qDesc=qDesc@entry=0x3a8c1b8) at resscheduler.c:684
#3  0x0000000000a7d366 in PortalStart (portal=portal@entry=0x3bc5148, params=params@entry=0x0, eflags=eflags@entry=0, snapshot=snapshot@entry=0x0, ddesc=ddesc@entry=0x0) at pquery.c:713
#4  0x0000000000a784a4 in exec_simple_query (query_string=0x38d0c88 "select\n\tl_returnflag,\n\tl_linestatus,\n\tsum(l_quantity) as sum_qty,\n\tsum(l_extendedprice) as sum_base_price,\n\tsum(l_extendedprice * (1 - l_discount)) as sum_disc_price,\n\tsum(l_extendedprice * (1 - l_dis"...) at postgres.c:1785
#5  0x0000000000a7b470 in PostgresMain (argc=<optimized out>, argv=argv@entry=0x38b06b8, dbname=0x38b0570 "tpch1s", username=<optimized out>) at postgres.c:5246

查询结束时,资源注销

#0  ResLockUpdateLimit (proclock=0x7fb305aa29b0, inError=0 '\000', increment=0 '\000', incrementSet=0x7fb308a6cf68, lock=0x7fb305517130) at resqueue.c:834
#1  ResLockRelease (locktag=locktag@entry=0x7ffebec953d0, resPortalId=0) at resqueue.c:605
#2  0x0000000000c22d5f in ResUnLockPortal (portal=portal@entry=0x3bc5148) at resscheduler.c:852
#3  0x000000000085fc2a in PortalCleanup (portal=0x3bc5148) at portalcmds.c:344
#4  0x0000000000c102ca in PortalDrop (portal=0x3bc5148,	isTopCommit=<optimized out>) at portalmem.c:535

四、dispatcher资源控制

在查询开始时,通过以下函数获得内存保留值:

void
PortalStart(Portal portal, ParamListInfo params,
			int eflags, Snapshot snapshot,
			QueryDispatchDesc *ddesc)
{
// …
// query_mem = 0表示无限制,单位:Byte。
queryDesc->plannedstmt->query_mem = ResourceManagerGetQueryMemoryLimit(queryDesc->plannedstmt);

ResourceManagerGetQueryMemoryLimit函数获得查询内存资源配额:

/*
 * Calculate the amount of memory reserved for the query.
 *
 * Returns 0 when Gp_role is not GP_ROLE_DISPATCH, when the process is not
 * running under the postmaster, or when neither resource queues nor resource
 * groups are in effect.  Otherwise the value is obtained from the resource
 * queue (checked first) or from the resource group.
 */
int64
ResourceManagerGetQueryMemoryLimit(PlannedStmt* stmt)
{
	if (Gp_role != GP_ROLE_DISPATCH)
		return 0;

	/* no limits in single user mode. */
	if (!IsUnderPostmaster)   // IsUnderPostmaster is true when this process is a child of the postmaster.
		return 0;

	Assert(gp_session_id > -1);
	Assert(ActivePortal != NULL);

	/* The resource queue check takes precedence over the resource group check. */
	if (IsResQueueEnabled())
		return ResourceQueueGetQueryMemoryLimit(stmt, ActivePortal->queueId);
	if (IsResGroupActivated())
		return ResourceGroupGetQueryMemoryLimit();

	return 0;
}

ResourceGroupGetQueryMemoryLimit需要知道每个segment可以使用的内存总额,这个总额是由decideTotalChunks函数计算得到的:

// Calculate the total memory chunks of the segment
static void
decideTotalChunks(int32 *totalChunks, int32 *chunkSizeInBits)
{
  ……
	nsegments = Gp_role == GP_ROLE_EXECUTE ? host_segments : pResGroupControl->segmentsOnMaster;
  // ResGroupOps_GetTotalMemory获得系统可用内存:
  // RAM * overcommit_ratio + Swap
	tmptotalChunks = ResGroupOps_GetTotalMemory() * gp_resource_group_memory_limit / nsegments;

	/* If vmem is larger than 16GB (i.e., 16K MB), we make the chunks bigger
	 * so that the vmem limit in chunks unit is not larger than 16K.*/
	tmpchunkSizeInBits = BITS_IN_MB;
	while(tmptotalChunks > (16 * 1024))
	{
		tmpchunkSizeInBits++;
		tmptotalChunks >>= 1;
	}
  ……
}

五、资源队列

获得资源队列内存限制值

uint64 ResourceQueueGetQueryMemoryLimit(PlannedStmt *stmt, Oid queueId)

ResourceQueueGetQueryMemoryLimit分配内存原则:

// 超级用户,不限制内存。
if (superuser())
    return ResourceQueueGetSuperuserQueryMemoryLimit();
// gp_resqueue_memory_policy参数为none时,不限制内存。
if (IsResManagerMemoryPolicyNone())
    return 0;
// 先按照并发数limit以及计划的cost,计算出等分内存的最小比例:
double minRatio = Min( 1.0/ (double) numSlots, planCost / costLimit);
……
// 以上按比例计算出来的内存大小,如果小于statement_mem,则使用statement_mem。
if (queryMem < (uint64) statement_mem * 1024L)
{
    queryMem = (uint64) statement_mem * 1024L;
}

六、算子buffer

在dispatcher中,给算子分配内存配额:

src\backend\executor\execMain.c:
void standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
{
  ……
	/*Distribute memory to operators.*/
	if (Gp_role == GP_ROLE_DISPATCH)
	{
		……
			switch(*gp_resmanager_memory_policy)
			{
				case RESMANAGER_MEMORY_POLICY_AUTO:
					PolicyAutoAssignOperatorMemoryKB(queryDesc->plannedstmt,
													 queryDesc->plannedstmt->query_mem);
					break;
				case RESMANAGER_MEMORY_POLICY_EAGER_FREE:
					PolicyEagerFreeAssignOperatorMemoryKB(queryDesc->plannedstmt,
														  queryDesc->plannedstmt->query_mem);
					break;
				default:
					Assert(IsResManagerMemoryPolicyNone());
					break;
			}

计算算子buffer配额大小的函数:

src\backend\executor\execUtils.c:
/*
 * Return the memory quota for the operator whose state is ps (the KB suffix
 * suggests the unit is kilobytes).
 *
 * If the plan node carries no quota (operatorMemKB == 0), work_mem is
 * returned.  Otherwise the plan's operatorMemKB is returned; for an AggState
 * the result of MemoryAccounting_RequestQuotaIncrease(), shifted right by 10
 * bits, is added on top.
 *
 * Both ps and ps->plan are asserted to be non-NULL.
 */
uint64 PlanStateOperatorMemKB(const PlanState *ps)
{
	Assert(ps);
	Assert(ps->plan);
	uint64 result = 0;
	if (ps->plan->operatorMemKB == 0)
	{
		/**
		 * There are some statements that do not go through the resource queue and these
		 * plans dont get decorated with the operatorMemKB. Someday, we should fix resource queues.
		 */
		result = work_mem;
	}
	else
	{
		if (IsA(ps, AggState))
		{
			/* Retrieve all relinquished memory (quota the other node not using) */
			/* NOTE(review): ">> 10" divides by 1024 -- presumably bytes to KB; confirm. */
			result = ps->plan->operatorMemKB + (MemoryAccounting_RequestQuotaIncrease() >> 10);
		}
		else
			result = ps->plan->operatorMemKB;
	}
	
	return result;
}

例如,aggregate在创建hash表时,确定hash表最大可用内存:

HashAggTable *
create_agg_hash_table(AggState *aggstate)
{
	……
	HashAggTable *hashtable;
	……
	hashtable = (HashAggTable *) palloc0(sizeof(HashAggTable));
  ……
  uint64 operatorMemKB = PlanStateOperatorMemKB( (PlanState *) aggstate);
  ……
  hashtable->max_mem = 1024.0 * operatorMemKB;
  ……
}

七、参数控制

内存资源管理策略

gp_resqueue_memory_policy和gp_resgroup_memory_policy参数对应的枚举类型:

src\include\cdb\memquota.h:
/*
 * Memory policy selectable for the resource manager.
 *
 * No explicit values are given, so the enumerators are 0, 1 and 2 in the
 * order listed (NONE, AUTO, EAGER_FREE).
 */
typedef enum ResManagerMemoryPolicy
{
  RESMANAGER_MEMORY_POLICY_NONE,
  RESMANAGER_MEMORY_POLICY_AUTO,
  RESMANAGER_MEMORY_POLICY_EAGER_FREE
} ResManagerMemoryPolicy;

auto和eager_free两种策略对应的分配内存函数:

src\include\cdb\memquota.h:
extern void PolicyAutoAssignOperatorMemoryKB(PlannedStmt *stmt, uint64 memoryAvailable);
extern void PolicyEagerFreeAssignOperatorMemoryKB(PlannedStmt *stmt, uint64 memoryAvailable);

max_statement_mem无实际控制作用

按照GP文档说法,这个参数是对statement_mem的保护,但是从实际实验和代码看,没有实现文档所述功能。

src\backend\cdb\cdbvars.c:
/*
 * Check a proposed statement_mem value (appears to be a GUC check hook,
 * judging by the signature and the use of GUC_check_errmsg -- confirm).
 *
 * When *newval is greater than or equal to max_statement_mem,
 * GUC_check_errmsg is called with an "Invalid input" message.
 *
 * NOTE(review): the function returns true on every path, including the one
 * that sets the message, so as written it never reports the value as invalid
 * to its caller.
 */
bool
gpvars_check_statement_mem(int *newval, void **extra, GucSource source)
{
	if (*newval >= max_statement_mem)
	{
		GUC_check_errmsg("Invalid input for statement_mem, must be less than max_statement_mem (%d kB)",
						 max_statement_mem);
	}

	/* Returned unconditionally, even after the message above was set. */
	return true;
}

获得最新文章,请访问:http://www.200yi.com/ff_internal/

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值