1、InstanceBatchHW_VTF 简介
InstanceBatchHW_VTF 全名 InstanceBatch Hardware Vertex Texture Fetch,即用纹理单元存储骨骼仿射矩阵。创建同一个批次中的模型时,将每个模型的骨头仿射矩阵存储到纹理单元中,并将纹理坐标和骨骼矩阵索引存储到顶点缓存里;最后GPU在顶点着色器中从纹理读取这些矩阵来更新受骨头影响的顶点位置,进而产生动画效果。
2、InstanceBatchHW_VTF 创建
- 复制材质文件,如果没有阴影材质文件则创建。
- 删除顶点数据类型中骨骼混合权重和骨骼ID数据
- 创建顶点纹理单元添加到材质文件中
纹理单元宽度: 根据骨头数量、entity数量以及单个骨头矩阵占用的像素数三者相乘计算,最大为4096,超过4096的部分另起一行。( 例如骨头数量:18、entity:16 、单个骨头矩阵占用像素数:3 texture width = 18×16×3=864)
纹理单元高度: 所有entity的骨头矩阵所需的像素总数除以每行实际可用的宽度,有余数时再加一行(即向上取整)
void BaseInstanceBatchVTF::createVertexTexture( const SubMesh* baseSubMesh )
{
/*
TODO: Find a way to retrieve max texture resolution,
http://www.ogre3d.org/forums/viewtopic.php?t=38305
Currently assuming it's 4096x4096, which is a safe bet for any hardware with decent VTF*/
size_t uniqueAnimations = mInstancesPerBatch;
if (useBoneMatrixLookup())
{
uniqueAnimations = std::min<size_t>(getMaxLookupTableInstances(), uniqueAnimations);
}
mMatricesPerInstance = std::max<size_t>( 1, baseSubMesh->blendIndexToBoneIndexMap.size() );
if(mUseBoneDualQuaternions && !mTempTransformsArray3x4)
{
mTempTransformsArray3x4 = OGRE_ALLOC_T(float, mMatricesPerInstance * 3 * 4, MEMCATEGORY_GENERAL);
}
mNumWorldMatrices = uniqueAnimations * mMatricesPerInstance;
//Calculate the width & height required to hold all the matrices. Start by filling the width
//first (i.e. 4096x1 4096x2 4096x3, etc)
size_t texWidth = std::min<size_t>( mNumWorldMatrices * mRowLength, c_maxTexWidth );
size_t maxUsableWidth = texWidth;
if( matricesTogetherPerRow() )
{
//The technique requires all matrices from the same instance in the same row
//i.e. 4094 -> 4095 -> skip 4096 -> 0 (next row) contains data from a new instance
mWidthFloatsPadding = texWidth % (mMatricesPerInstance * mRowLength);
if( mWidthFloatsPadding )
{
mMaxFloatsPerLine = texWidth - mWidthFloatsPadding;
maxUsableWidth = mMaxFloatsPerLine;
//Values are in pixels, convert them to floats (1 pixel = 4 floats)
mWidthFloatsPadding *= 4;
mMaxFloatsPerLine *= 4;
}
}
size_t texHeight = mNumWorldMatrices * mRowLength / maxUsableWidth;
if( (mNumWorldMatrices * mRowLength) % maxUsableWidth )
texHeight += 1;
//Don't use 1D textures, as OGL goes crazy because the shader should be calling texture1D()...
//TextureType texType = texHeight == 1 ? TEX_TYPE_1D : TEX_TYPE_2D;
TextureType texType = TEX_TYPE_2D;
mMatrixTexture = TextureManager::getSingleton().createManual(
mName + "/VTF", mMeshReference->getGroup(), texType,
(Ouint)texWidth, (Ouint)texHeight,
0, PF_FLOAT32_RGBA, TU_DYNAMIC_WRITE_ONLY_DISCARDABLE );
//Set our cloned material to use this custom texture!
setupMaterialToUseVTF( texType, mMaterial );
}
- 添加顶点缓存
(1)创建骨骼像素索引顶点缓存
创建float4纹理坐标类型顶点缓存,计算entity中每个顶点对应的骨头在纹理单元中对应的骨骼矩阵的U偏移值(即该骨头矩阵的每个像素相对于单个entity起始像素的位置:骨头索引 × 单个骨头占用的像素数 + 像素序号,再除以纹理宽度)
注意 在骨骼动画模型中顶点缓存中每个顶点存储信息{顶点坐标、法向量、纹理坐标、对应的骨骼ID、骨骼权重}。
(2)创建每个entity纹理起始位置的顶点缓存
创建float2类型,存储纹理坐标
(3) mInstanceVertexBuffer->setInstanceDataStepRate( 1 ) 这一句对应着OpenGL中用于多实例渲染的顶点属性接口 glVertexAttribDivisorARB(GLuint index, GLuint divisor):第一个参数表示顶点属性的layout索引,第二个参数指定顶点属性的更新方式,默认是0,表示顶点着色器每次执行(即每个顶点)时更新属性数据;填写1表示每个实例更新一次属性数据;填写2则表示每2个实例更新一次属性数据,依次类推。上面填写1即通知OpenGL这是一个instance array,每个实例更新一次数据,例如:
绘制100个矩形,glVertexAttribDivisor(2, 1):每个实例更新一次数据,100个矩形各自使用自己的模型变换矩阵
绘制100个矩形,glVertexAttribDivisor(2, 4):每4个实例更新一次数据,此时我们只能看到100 / 4 =25个矩形,因为每4个矩形的模型变换矩阵相同,绘制在了同一个位置,重合了
// Adds two extra vertex buffer sources to thisVertexData for VTF hardware instancing:
//  1. A per-vertex source made of VET_FLOAT4 texture coordinates. For every bone that
//     affects a vertex it stores (boneIndex * mRowLength + k) / textureWidth for each
//     pixel k of that bone's entry, zero-padded to whole float4s, followed by one
//     VES_BLEND_WEIGHTS float4 when mWeightCount > 1.
//  2. A per-instance source (mInstanceVertexBuffer, flagged as instance data with step
//     rate 1) made of one VET_FLOAT2 texture coordinate plus, when
//     useBoneMatrixLookup() is true, three more VET_FLOAT4 texture coordinates.
//     Its contents are filled by updateInstanceDataBuffer(true, NULL) at the end.
// thisVertexData: vertex data that receives the new declaration elements and bindings;
//                 its vertexCount sizes the per-vertex buffer.
// baseVertexData: its vertexCount bounds the loop that fills the per-vertex buffer.
// hwBoneIdx / hwBoneWgt: read at index [vertex * mWeightCount + weight].
void InstanceBatchHW_VTF::createVertexSemantics( VertexData *thisVertexData,
VertexData *baseVertexData,
const HWBoneIdxVec &hwBoneIdx,
const HWBoneWgtVec& hwBoneWgt)
{
const float texWidth = static_cast<float>(mMatrixTexture->getWidth());
//The bone lookup coordinates (and the blend weights, when there is more than one weight
//per vertex) are put in a new buffer source made only of float4 elements (16 bytes each).
unsigned short newSource = thisVertexData->vertexDeclaration->getMaxSource() + 1;
size_t offset = 0;
size_t maxFloatsPerVector = 4;
//Can fit two dual quaternions in every float4, but only one 3x4 matrix
//(the loop advances by how many bone entries of mRowLength floats fit in one float4)
for(size_t i = 0; i < mWeightCount; i += maxFloatsPerVector / mRowLength)
{
offset += thisVertexData->vertexDeclaration->addElement( newSource, offset, VET_FLOAT4, VES_TEXTURE_COORDINATES,
thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate() ).getSize();
}
//Add the weights (supports up to four, which is Ogre's limit)
if(mWeightCount > 1)
{
thisVertexData->vertexDeclaration->addElement(newSource, offset, VET_FLOAT4, VES_BLEND_WEIGHTS,
0 ).getSize();
}
//Create our own vertex buffer
HardwareVertexBufferSharedPtr vertexBuffer =
HardwareBufferManager::getSingleton().createVertexBuffer(
thisVertexData->vertexDeclaration->getVertexSize(newSource),
thisVertexData->vertexCount,
HardwareBuffer::HBU_STATIC_WRITE_ONLY );
thisVertexData->vertexBufferBinding->setBinding( newSource, vertexBuffer );
float *thisFloat = static_cast<float*>(vertexBuffer->lock(HardwareBuffer::HBL_DISCARD));
//Create the UVs to sample from the right bone/matrix
//(j is the index of the vertex's first entry in hwBoneIdx / hwBoneWgt)
for( size_t j=0; j < baseVertexData->vertexCount * mWeightCount; j += mWeightCount)
{
size_t numberOfMatricesInLine = 0;
//Write the matrices, adding padding as needed
for(size_t i = 0; i < mWeightCount; ++i)
{
//For this vertex, compute where each pixel of the bone's entry sits relative to the
//start of one instance's data (bone index * mRowLength + k), scaled by the texture
//width so it can be used as a U offset
for( size_t k=0; k < mRowLength; ++k)
{
size_t instanceIdx = hwBoneIdx[j+i] * mRowLength + k;
*thisFloat++ = instanceIdx / texWidth;
}
++numberOfMatricesInLine;
//Each texture coordinate element is a float4. Once another bone entry would no
//longer fit in the current float4, or this was the last weight, zero-fill the
//components that are left
if((numberOfMatricesInLine + 1) * mRowLength > maxFloatsPerVector || (i+1) == mWeightCount)
{
//Place zeroes in the remaining coordinates
for ( size_t k=mRowLength * numberOfMatricesInLine; k < maxFloatsPerVector; ++k)
{
*thisFloat++ = 0.0f;
}
numberOfMatricesInLine = 0;
}
}
//Don't need to write weights if there is only one
if(mWeightCount > 1)
{
//Write the weights
for(size_t i = 0; i < mWeightCount; ++i)
{
*thisFloat++ = hwBoneWgt[j+i];
}
//Write the empty space
for(size_t i = mWeightCount; i < maxFloatsPerVector; ++i)
{
*thisFloat++ = 0.0f;
}
}
}
vertexBuffer->unlock();
//Create the per-instance buffer through which every entity in the batch gets the
//texture coordinates of its matrices
newSource = thisVertexData->vertexDeclaration->getMaxSource() + 1;
offset = thisVertexData->vertexDeclaration->addElement( newSource, 0, VET_FLOAT2, VES_TEXTURE_COORDINATES,
thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate() ).getSize();
if (useBoneMatrixLookup())
{
//if using bone matrix lookup we will need to add 3 more float4 to contain the matrix. containing
//the personal world transform of each entity.
offset += thisVertexData->vertexDeclaration->addElement( newSource, offset, VET_FLOAT4, VES_TEXTURE_COORDINATES,
thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate() ).getSize();
offset += thisVertexData->vertexDeclaration->addElement( newSource, offset, VET_FLOAT4, VES_TEXTURE_COORDINATES,
thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate() ).getSize();
thisVertexData->vertexDeclaration->addElement( newSource, offset, VET_FLOAT4, VES_TEXTURE_COORDINATES,
thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate() ).getSize();
//Add two floats of padding here? or earlier?
//If not using bone matrix lookup, is it ok that it is 8 bytes since divides evenly into 16
}
//Create our own vertex buffer (one entry per instance in the batch)
mInstanceVertexBuffer = HardwareBufferManager::getSingleton().createVertexBuffer(
thisVertexData->vertexDeclaration->getVertexSize(newSource),
mInstancesPerBatch,
HardwareBuffer::HBU_STATIC_WRITE_ONLY );
thisVertexData->vertexBufferBinding->setBinding( newSource, mInstanceVertexBuffer );
//Mark this buffer as instanced, advancing once per instance
mInstanceVertexBuffer->setIsInstanceData( true );
mInstanceVertexBuffer->setInstanceDataStepRate( 1 );
//Fill the per-instance buffer for the first time
updateInstanceDataBuffer(true, NULL);
}
- 往实例顶点缓存(float2纹理坐标类型)中写入纹理坐标数据
注意这里计算并存储的是每个entity在纹理单元中的起始纹理坐标
如下代码:
//Updates the vertex buffer containing the per instance data (mInstanceVertexBuffer).
//The buffer is rewritten only when exactly one of isFirstTime / useBoneMatrixLookup()
//is true, i.e. on the first call when no bone matrix lookup is used, or on later calls
//when it is used. For each instance written it stores the (u, v) at which that
//instance's matrices start in the matrix texture and, with bone matrix lookup, the
//first three rows of the entity's parent node full transform (made camera relative when
//a camera is given and camera relative rendering is enabled).
//isFirstTime: true for the initial fill of the buffer.
//currentCamera: camera passed to findVisible(); only dereferenced when non-null.
//Returns the number of entries written when the buffer is rewritten, otherwise
//mInstancedEntities.size().
size_t InstanceBatchHW_VTF::updateInstanceDataBuffer(bool isFirstTime, Camera* currentCamera)
{
size_t visibleEntityCount = 0;
bool useMatrixLookup = useBoneMatrixLookup();
if (isFirstTime ^ useMatrixLookup)
{
//update the mTransformLookupNumber value in the entities if needed
updateSharedLookupIndexes();
const float texWidth = static_cast<float>(mMatrixTexture->getWidth());
const float texHeight = static_cast<float>(mMatrixTexture->getHeight());
//Half-texel offsets (subtracted below, so the UVs are shifted by +0.5 texel).
//NOTE(review): presumably so that sampling hits texel centres rather than texel
//edges - confirm
Vector2 texelOffsets;
//RenderSystem *renderSystem = Root::getSingleton().getRenderSystem();
texelOffsets.x = /*renderSystem->getHorizontalTexelOffset()*/ -0.5f / texWidth;
texelOffsets.y = /*renderSystem->getHorizontalTexelOffset()*/ -0.5f / texHeight;
float *thisVec = static_cast<float*>(mInstanceVertexBuffer->lock(HardwareBuffer::HBL_DISCARD));
//Usable pixels per texture row: mMaxFloatsPerLine is in floats (4 floats per pixel)
const size_t maxPixelsPerLine = std::min( static_cast<size_t>(mMatrixTexture->getWidth()), mMaxFloatsPerLine >> 2 );
//Calculate UV offsets, which change per instance
for( size_t i=0; i<mInstancesPerBatch; ++i )
{
//entity is only needed (and only dereferenced) when bone matrix lookup is used
InstancedEntity* entity = useMatrixLookup ? mInstancedEntities[i] : NULL;
if //Update if we are not using a lookup bone matrix method. In this case the function will
//be called only once
(!useMatrixLookup ||
//Update if we are in the visible range of the camera (for look up bone matrix method
//and static mode).
(entity->findVisible(currentCamera)))
{
//Compute the texture x / y and store them in the buffer. instanceIdx is
//matrix index * matrices per instance * pixels per matrix (mRowLength), so what
//gets stored is the texture coordinate where this entity's matrices start
size_t matrixIndex = useMatrixLookup ? entity->mTransformLookupNumber : i;
size_t instanceIdx = matrixIndex * mMatricesPerInstance * mRowLength;
*thisVec = ((instanceIdx % maxPixelsPerLine) / texWidth) - (float)(texelOffsets.x);
*(thisVec + 1) = ((instanceIdx / maxPixelsPerLine) / texHeight) - (float)(texelOffsets.y);
thisVec += 2;
if (useMatrixLookup)
{
//Append rows 0..2 of the entity's parent node full transform (12 floats)
const Matrix4& mat = entity->_getParentNodeFullTransform();
*(thisVec) = static_cast<float>( mat[0][0] );
*(thisVec + 1) = static_cast<float>( mat[0][1] );
*(thisVec + 2) = static_cast<float>( mat[0][2] );
*(thisVec + 3) = static_cast<float>( mat[0][3] );
*(thisVec + 4) = static_cast<float>( mat[1][0] );
*(thisVec + 5) = static_cast<float>( mat[1][1] );
*(thisVec + 6) = static_cast<float>( mat[1][2] );
*(thisVec + 7) = static_cast<float>( mat[1][3] );
*(thisVec + 8) = static_cast<float>( mat[2][0] );
*(thisVec + 9) = static_cast<float>( mat[2][1] );
*(thisVec + 10)= static_cast<float>( mat[2][2] );
*(thisVec + 11)= static_cast<float>( mat[2][3] );
//Subtract the camera position from the last column (elements 3, 7 and 11)
if(currentCamera && mManager->getCameraRelativeRendering()) // && useMatrixLookup
{
const Vector3 &cameraRelativePosition = currentCamera->getDerivedPosition();
*(thisVec + 3) -= static_cast<float>( cameraRelativePosition.x );
*(thisVec + 7) -= static_cast<float>( cameraRelativePosition.y );
*(thisVec + 11) -= static_cast<float>( cameraRelativePosition.z );
}
thisVec += 12;
}
++visibleEntityCount;
}
}
mInstanceVertexBuffer->unlock();
}
else
{
//Buffer left untouched: report every instanced entity
visibleEntityCount = mInstancedEntities.size();
}
return visibleEntityCount;
}
3、InstanceBatchHW_VTF 写入矩阵纹理数据
- 将每个entity骨头矩阵写入到纹理数据中
如下代码:
/** Writes the bone transforms of the visible instanced entities into the matrix texture.

    When bone matrix lookup is in use, the per-instance vertex buffer is refreshed first
    through updateInstanceDataBuffer(), and each lookup slot of the texture is written at
    most once even if several entities share it. Otherwise the visible entities are
    packed one after another in the texture.

    mDirtyAnimation is reset and then set if any written entity reports an animation
    update (only queried when the mesh has a skeleton).

    @param currentCamera Camera handed to InstancedEntity::findVisible() for culling.
    @return With bone matrix lookup: the value returned by updateInstanceDataBuffer().
            Without it: the number of entities whose transforms were written.
*/
size_t InstanceBatchHW_VTF::updateVertexTexture( Camera *currentCamera )
{
    size_t renderedInstances = 0;
    bool useMatrixLookup = useBoneMatrixLookup();
    if (useMatrixLookup)
    {
        //if we are using bone matrix look up we have to update the instance buffer for the
        //vertex texture to be relevant

        //also note that in this case the number of instances to render comes directly from the
        //updateInstanceDataBuffer() function, not from this function.
        renderedInstances = updateInstanceDataBuffer(false, currentCamera);
    }

    mDirtyAnimation = false;

    //Now lock the texture and copy the 4x3 matrices!
    mMatrixTexture->getBuffer()->lock( HardwareBuffer::HBL_DISCARD );
    const PixelBox &pixelBox = mMatrixTexture->getBuffer()->getCurrentLock();

    float *pSource = static_cast<float*>(pixelBox.data);

    //Tracks which lookup slots already received their bone data in this call
    vector<bool>::type writtenPositions(getMaxLookupTableInstances(), false);

    //Floats taken by one entity (4 floats per pixel) and how many entities fit in the
    //usable part of one texture row
    size_t floatPerEntity = mMatricesPerInstance * mRowLength * 4;
    size_t entitiesPerPadding = (size_t)(mMaxFloatsPerLine / floatPerEntity);

    size_t instanceCount = mInstancedEntities.size();
    size_t updatedInstances = 0;

    float* transforms = NULL;
    //If using dual quaternions, write 3x4 matrices to a temporary buffer, then convert to dual quaternions
    if(mUseBoneDualQuaternions)
    {
        transforms = mTempTransformsArray3x4;
    }

    for(size_t i = 0 ; i < instanceCount ; ++i)
    {
        InstancedEntity* entity = mInstancedEntities[i];

        //Without lookup the entities are packed in the order they are written;
        //with lookup the entity's own slot number decides the position
        size_t textureLookupPosition = updatedInstances;
        if (useMatrixLookup)
        {
            textureLookupPosition = entity->mTransformLookupNumber;
        }

        //Check that we are not using a lookup matrix or that we have not already written
        //The bone data
        if (((!useMatrixLookup) || !writtenPositions[entity->mTransformLookupNumber]) &&
            //Cull on an individual basis, the less entities are visible, the less instances we draw.
            //No need to use null matrices at all!
            (entity->findVisible( currentCamera )))
        {
            //Destination of this entity's data: its slot, plus the padding floats skipped
            //at the end of every texture row that precedes it
            float* pDest = pSource + floatPerEntity * textureLookupPosition +
                (size_t)(textureLookupPosition / entitiesPerPadding) * mWidthFloatsPadding;

            //Without dual quaternions the 3x4 matrices are written straight into the texture
            if(!mUseBoneDualQuaternions)
            {
                transforms = pDest;
            }

            if( mMeshReference->hasSkeleton() )
                mDirtyAnimation |= entity->_updateAnimation();

            size_t floatsWritten = entity->getTransforms3x4( transforms );

            if( !useMatrixLookup && mManager->getCameraRelativeRendering() )
                makeMatrixCameraRelative3x4( transforms, floatsWritten );

            if(mUseBoneDualQuaternions)
            {
                //12 floats per 3x4 matrix
                convert3x4MatricesToDualQuaternions(transforms, floatsWritten / 12, pDest);
            }

            if (useMatrixLookup)
            {
                writtenPositions[entity->mTransformLookupNumber] = true;
            }
            else
            {
                ++updatedInstances;
            }
        }
    }

    if (!useMatrixLookup)
    {
        renderedInstances = updatedInstances;
    }

    mMatrixTexture->getBuffer()->unlock();

    return renderedInstances;
}
4、InstanceBatchHW_VTF GPU顶点着色器数据更新
vertex shader中
attribute vec4 uv1 //单个骨头3个像素偏移值 w为0
attribute vec4 uv2 //对象对应着entity起始纹理坐标
- 代码注释详解 如下代码:
#version 120
//Vertex input
attribute vec4 vertex;
attribute vec3 normal;
//Blend weights are only declared when BONE_TWO_WEIGHTS is defined
#ifdef BONE_TWO_WEIGHTS
attribute vec4 blendWeights;
#endif
attribute vec4 uv0;
//Per the notes accompanying this shader: pixel offsets of the vertex's bone inside one
//instance's data (w is 0)
attribute vec4 uv1;
//Per the notes accompanying this shader: texture coordinate at which the instance's
//matrices start
attribute vec4 uv2;
//NOTE(review): presumably the three float4 texture coordinates carrying the per-instance
//world matrix when bone matrix lookup is used - confirm against the C++ side
#if BONE_MATRIX_LUT
attribute vec4 uv3;
attribute vec4 uv4;
attribute vec4 uv5;
#endif
attribute vec3 tangent;
下面省略。。。。。。。。。。。。。