Ogre--InstanceBatchHW_VTF_ogre instance-优快云博客

本文链接：https://blog.youkuaiyun.com/weixin_43846381/article/details/93406241

1、InstanceBatchHW_VTF 简介

InstanceBatchHW_VTF 全称为 InstanceBatch Hardware Vertex Texture Fetch，即利用顶点纹理（Vertex Texture）存储骨骼仿射矩阵的硬件实例化批次。创建同一批次中的模型时，将每个模型的骨骼仿射矩阵写入纹理，并把纹理坐标和骨骼矩阵索引存入顶点缓存；最后 GPU 在顶点着色器中从纹理取出这些矩阵，用来变换受骨骼影响的顶点位置，进而产生动画效果。

2、InstanceBatchHW_VTF 创建

复制材质文件,如果没有阴影材质文件则创建。
删除顶点数据类型中骨骼混合权重和骨骼ID数据
创建顶点纹理单元添加到材质文件中
纹理单元宽度：由骨头数量、entity 数量以及单个骨头矩阵占用的像素数相乘得到；如果宽度超过 4096，则超出部分另起一行。（例如骨头数量：18、entity 数量：16、单个骨头矩阵占用像素数：3，则 texture width = 18×16×3 = 864）
纹理单元高度：所有 entity 占用的像素总数除以上面计算出的纹理（可用）宽度，有余数时向上取整（高度加 1）。

 void BaseInstanceBatchVTF::createVertexTexture( const SubMesh* baseSubMesh )
    {
        /*
        TODO: Find a way to retrieve max texture resolution,
        http://www.ogre3d.org/forums/viewtopic.php?t=38305

        Currently assuming it's 4096x4096, which is a safe bet for any hardware with decent VTF*/
        
        size_t uniqueAnimations = mInstancesPerBatch;
        if (useBoneMatrixLookup())
        {
            uniqueAnimations = std::min<size_t>(getMaxLookupTableInstances(), uniqueAnimations);
        }
        mMatricesPerInstance = std::max<size_t>( 1, baseSubMesh->blendIndexToBoneIndexMap.size() );

        if(mUseBoneDualQuaternions && !mTempTransformsArray3x4)
        {
            mTempTransformsArray3x4 = OGRE_ALLOC_T(float, mMatricesPerInstance * 3 * 4, MEMCATEGORY_GENERAL);
        }
        
        mNumWorldMatrices = uniqueAnimations * mMatricesPerInstance;

        //Calculate the width & height required to hold all the matrices. Start by filling the width
        //first (i.e. 4096x1 4096x2 4096x3, etc)
        
        size_t texWidth         = std::min<size_t>( mNumWorldMatrices * mRowLength, c_maxTexWidth );
        size_t maxUsableWidth   = texWidth;
        if( matricesTogetherPerRow() )
        {
            //The technique requires all matrices from the same instance in the same row
            //i.e. 4094 -> 4095 -> skip 4096 -> 0 (next row) contains data from a new instance 
            mWidthFloatsPadding = texWidth % (mMatricesPerInstance * mRowLength);

            if( mWidthFloatsPadding )
            {
                mMaxFloatsPerLine = texWidth - mWidthFloatsPadding;

                maxUsableWidth = mMaxFloatsPerLine;

                //Values are in pixels, convert them to floats (1 pixel = 4 floats)
                mWidthFloatsPadding *= 4;
                mMaxFloatsPerLine       *= 4;
            }
        }

        size_t texHeight = mNumWorldMatrices * mRowLength / maxUsableWidth;

        if( (mNumWorldMatrices * mRowLength) % maxUsableWidth )
            texHeight += 1;

        //Don't use 1D textures, as OGL goes crazy because the shader should be calling texture1D()...
        //TextureType texType = texHeight == 1 ? TEX_TYPE_1D : TEX_TYPE_2D;
        TextureType texType = TEX_TYPE_2D;

        mMatrixTexture = TextureManager::getSingleton().createManual(
                                        mName + "/VTF", mMeshReference->getGroup(), texType,
                                        (Ouint)texWidth, (Ouint)texHeight,
                                        0, PF_FLOAT32_RGBA, TU_DYNAMIC_WRITE_ONLY_DISCARDABLE );

        //Set our cloned material to use this custom texture!
        setupMaterialToUseVTF( texType, mMaterial );
    }

添加顶点缓存
(1)创建骨骼像素索引顶点缓存
创建 float4 纹理坐标类型的顶点缓存，计算 entity 中每个顶点所绑定的骨头对应的骨骼矩阵在纹理单元中的 U 偏移值（也就是该骨头矩阵的每个像素在单个 entity 全部骨头像素中的位置）。
注意在骨骼动画模型中顶点缓存中每个顶点存储信息{顶点坐标、法向量、纹理坐标、对应的骨骼ID、骨骼权重}。
(2)创建每个entity纹理起始位置的顶点缓存
创建float2类型,存储纹理坐标

(3) mInstanceVertexBuffer->setInstanceDataStepRate( 1 ) 这一句对应 OpenGL 中用于多实例渲染的顶点属性接口 glVertexAttribDivisorARB(GLuint index, GLuint divisor)：第一个参数表示顶点属性（layout）索引，第二个参数指定顶点属性的更新方式。默认值 0 表示顶点着色器每次执行时（即每个顶点）更新属性数据；填写 1 表示每个实例更新一次属性数据；填写 2 则表示每 2 个实例更新一次属性数据，依次类推。上面填写 1 即通知 OpenGL 这是一个 instance array，每个实例更新一次数据。例如：
绘制100个矩形glVertexAttribDivisor(2, 1)

绘制100个矩形glVertexAttribDivisor(2, 4) 每4个实例更新一次数据时，我们将会得到100 / 4 =25个矩形，因为每4个矩形的模型变换矩阵相同，因此放在了同一个位置，重合了

  //Adds the two extra vertex streams this technique needs to thisVertexData:
  //  * a per-vertex stream with the U offsets (into the matrix texture) of every texel
  //    of each influencing bone, followed by the blend weights when more than one
  //    weight per vertex is used;
  //  * a per-instance stream (step rate 1) with the UV at which each entity's matrices
  //    start, plus three float4 for the entity's world transform when the bone matrix
  //    lookup is enabled.
  //hwBoneIdx / hwBoneWgt hold mWeightCount entries per vertex of baseVertexData.
  void InstanceBatchHW_VTF::createVertexSemantics( VertexData *thisVertexData,
                                                         VertexData *baseVertexData,
                                                         const HWBoneIdxVec &hwBoneIdx,
                                                         const HWBoneWgtVec& hwBoneWgt)
    {
        //Number of floats held by one VET_FLOAT4 element
        const size_t floatsPerElement = 4;
        const float matrixTexWidth = static_cast<float>( mMatrixTexture->getWidth() );

        //The bone lookups go in a buffer of their own, since it's 16 bytes aligned :-)
        unsigned short streamIdx = thisVertexData->vertexDeclaration->getMaxSource() + 1;
        size_t byteOffset = 0;

        //One float4 per group of matrices: two dual quaternions fit in a float4,
        //but only one 3x4 matrix does
        for( size_t w = 0; w < mWeightCount; w += floatsPerElement / mRowLength )
        {
            const unsigned short texCoordSet =
                thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate();
            byteOffset += thisVertexData->vertexDeclaration->addElement(
                                streamIdx, byteOffset, VET_FLOAT4,
                                VES_TEXTURE_COORDINATES, texCoordSet ).getSize();
        }

        //Blend weights are only stored when there is more than one
        //(up to four are supported, which is Ogre's limit)
        if( mWeightCount > 1 )
        {
            thisVertexData->vertexDeclaration->addElement( streamIdx, byteOffset, VET_FLOAT4,
                                                           VES_BLEND_WEIGHTS, 0 );
        }

        //Per-vertex buffer backing the declaration above
        HardwareVertexBufferSharedPtr vertexBuffer =
            HardwareBufferManager::getSingleton().createVertexBuffer(
                thisVertexData->vertexDeclaration->getVertexSize(streamIdx),
                thisVertexData->vertexCount,
                HardwareBuffer::HBU_STATIC_WRITE_ONLY );
        thisVertexData->vertexBufferBinding->setBinding( streamIdx, vertexBuffer );

        float *out = static_cast<float*>( vertexBuffer->lock(HardwareBuffer::HBL_DISCARD) );

        //Write, for every vertex, the U coordinates used to sample the right bone/matrix
        const size_t vertexTotal = baseVertexData->vertexCount;
        for( size_t v = 0; v < vertexTotal; ++v )
        {
            //Index of this vertex's first entry in hwBoneIdx / hwBoneWgt
            const size_t firstWeight = v * mWeightCount;

            //Matrices already packed into the float4 currently being filled
            size_t packed = 0;

            for( size_t w = 0; w < mWeightCount; ++w )
            {
                //Texel index, relative to the start of the entity's matrices, of the
                //first texel of the bone matrix that influences this vertex
                const size_t firstPixel = hwBoneIdx[firstWeight + w] * mRowLength;
                for( size_t px = 0; px < mRowLength; ++px )
                {
                    *out = static_cast<float>( firstPixel + px ) / matrixTexWidth;
                    ++out;
                }

                ++packed;

                //The element is a float4: once no further matrix fits (or this was the
                //last weight) zero-fill whatever is left of it
                const bool elementFull = (packed + 1) * mRowLength > floatsPerElement;
                const bool lastWeight  = (w + 1) == mWeightCount;
                if( elementFull || lastWeight )
                {
                    for( size_t pad = mRowLength * packed; pad < floatsPerElement; ++pad )
                    {
                        *out = 0.0f;
                        ++out;
                    }

                    packed = 0;
                }
            }

            //A single weight is implicitly 1, so nothing is stored for it
            if( mWeightCount > 1 )
            {
                for( size_t w = 0; w < mWeightCount; ++w )
                {
                    *out = hwBoneWgt[firstWeight + w];
                    ++out;
                }

                //Unused components of the float4
                for( size_t w = mWeightCount; w < floatsPerElement; ++w )
                {
                    *out = 0.0f;
                    ++out;
                }
            }
        }

        vertexBuffer->unlock();

        //Second stream: per-entity data, starting with the UV at which the entity's
        //matrices begin inside the matrix texture
        streamIdx = thisVertexData->vertexDeclaration->getMaxSource() + 1;
        byteOffset = thisVertexData->vertexDeclaration->addElement(
                            streamIdx, 0, VET_FLOAT2, VES_TEXTURE_COORDINATES,
                            thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate() ).getSize();

        if( useBoneMatrixLookup() )
        {
            //With the bone matrix lookup, three more float4 carry the personal world
            //transform of each entity.
            //NOTE (open question from the original): add two floats of padding here, or
            //earlier? Without the lookup the stream is 8 bytes, which divides evenly into 16.
            for( int row = 0; row < 3; ++row )
            {
                byteOffset += thisVertexData->vertexDeclaration->addElement(
                                    streamIdx, byteOffset, VET_FLOAT4, VES_TEXTURE_COORDINATES,
                                    thisVertexData->vertexDeclaration->getNextFreeTextureCoordinate() ).getSize();
            }
        }

        //Per-instance buffer: one record for every instance in the batch
        mInstanceVertexBuffer = HardwareBufferManager::getSingleton().createVertexBuffer(
                                        thisVertexData->vertexDeclaration->getVertexSize(streamIdx),
                                        mInstancesPerBatch,
                                        HardwareBuffer::HBU_STATIC_WRITE_ONLY );
        thisVertexData->vertexBufferBinding->setBinding( streamIdx, mInstanceVertexBuffer );

        //Flag the buffer as instance data that advances once per instance
        mInstanceVertexBuffer->setIsInstanceData( true );
        mInstanceVertexBuffer->setInstanceDataStepRate( 1 );

        updateInstanceDataBuffer(true, NULL);
    }

往顶点二维顶点纹理类型缓存中写入纹理坐标数据
注意这里计算存储的是每个entity在纹理单元中起始的纹理坐标
如下代码：

 //Updates the vertex buffer containing the per instance data.
 //
 //The buffer is rewritten only when exactly one of isFirstTime / useBoneMatrixLookup()
 //is true: once at creation time when no lookup table is used, or on every later call
 //when it is. In the remaining cases nothing is written and the total number of
 //instanced entities is returned.
 //
 //isFirstTime:   true when called while the buffers are being created.
 //currentCamera: camera used for the per-entity visibility test and for camera-relative
 //               rendering; only consulted when the bone matrix lookup is in use.
 //Returns the number of per-instance records written (or mInstancedEntities.size()
 //when the buffer was left untouched).
    size_t InstanceBatchHW_VTF::updateInstanceDataBuffer(bool isFirstTime, Camera* currentCamera)
    {
        size_t visibleEntityCount = 0;
        bool useMatrixLookup = useBoneMatrixLookup();
        if (isFirstTime != useMatrixLookup)
        {
            //update the mTransformLookupNumber value in the entities if needed 
            updateSharedLookupIndexes();

            const float texWidth  = static_cast<float>(mMatrixTexture->getWidth());
            const float texHeight = static_cast<float>(mMatrixTexture->getHeight());

            //Half-texel offsets, so that the texture is sampled at texel centres rather
            //than on texel edges
            Vector2 texelOffsets;
            //RenderSystem *renderSystem = Root::getSingleton().getRenderSystem();
            texelOffsets.x = /*renderSystem->getHorizontalTexelOffset()*/ -0.5f / texWidth;
            texelOffsets.y = /*renderSystem->getVerticalTexelOffset()*/ -0.5f / texHeight;

            float *thisVec = static_cast<float*>(mInstanceVertexBuffer->lock(HardwareBuffer::HBL_DISCARD));

            //mMaxFloatsPerLine is expressed in floats; 4 floats make one pixel
            const size_t maxPixelsPerLine = std::min( static_cast<size_t>(mMatrixTexture->getWidth()), mMaxFloatsPerLine >> 2 );

            //Calculate UV offsets, which change per instance
            for( size_t i=0; i<mInstancesPerBatch; ++i )
            {
                InstancedEntity* entity = useMatrixLookup ? mInstancedEntities[i] : NULL;
                if  //Update if we are not using a lookup bone matrix method. In this case the function will 
                    //be called only once
                    (!useMatrixLookup || 
                    //Update if we are in the visible range of the camera (for look up bone matrix method
                    //and static mode).
                    (entity->findVisible(currentCamera)))
                {
                    //Texel index at which this entity's matrices start:
                    //matrix slot * matrices per instance * texels per matrix.
                    //The UV stored below is therefore the entity's starting coordinate
                    //inside the matrix texture.
                    size_t matrixIndex = useMatrixLookup ? entity->mTransformLookupNumber : i;
                    size_t instanceIdx = matrixIndex * mMatricesPerInstance * mRowLength;
                    *thisVec = ((instanceIdx % maxPixelsPerLine) / texWidth) - (float)(texelOffsets.x);
                    *(thisVec + 1) = ((instanceIdx / maxPixelsPerLine) / texHeight) - (float)(texelOffsets.y);
                    thisVec += 2;

                    if (useMatrixLookup)
                    {
                        const Matrix4& mat =  entity->_getParentNodeFullTransform();

                        //Translation column of the 3x4 world matrix. Any camera-relative
                        //adjustment is applied to these locals first: the buffer was
                        //created write-only and locked with HBL_DISCARD, so it must only
                        //be written to, never read back (as a "-=" on it would do).
                        float translationX = static_cast<float>( mat[0][3] );
                        float translationY = static_cast<float>( mat[1][3] );
                        float translationZ = static_cast<float>( mat[2][3] );
                        if(currentCamera && mManager->getCameraRelativeRendering()) // && useMatrixLookup
                        {
                            const Vector3 &cameraRelativePosition = currentCamera->getDerivedPosition();
                            translationX -= static_cast<float>( cameraRelativePosition.x );
                            translationY -= static_cast<float>( cameraRelativePosition.y );
                            translationZ -= static_cast<float>( cameraRelativePosition.z );
                        }

                        *(thisVec)     = static_cast<float>( mat[0][0] );
                        *(thisVec + 1) = static_cast<float>( mat[0][1] );
                        *(thisVec + 2) = static_cast<float>( mat[0][2] );
                        *(thisVec + 3) = translationX;
                        *(thisVec + 4) = static_cast<float>( mat[1][0] );
                        *(thisVec + 5) = static_cast<float>( mat[1][1] );
                        *(thisVec + 6) = static_cast<float>( mat[1][2] );
                        *(thisVec + 7) = translationY;
                        *(thisVec + 8) = static_cast<float>( mat[2][0] );
                        *(thisVec + 9) = static_cast<float>( mat[2][1] );
                        *(thisVec + 10)= static_cast<float>( mat[2][2] );
                        *(thisVec + 11)= translationZ;
                        thisVec += 12;
                    }
                    ++visibleEntityCount;
                }
            }

            mInstanceVertexBuffer->unlock();
        }
        else
        {
            visibleEntityCount = mInstancedEntities.size();
        }
        return visibleEntityCount;
    }

3、InstanceBatchHW_VTF 写入矩阵纹理数据

将每个entity骨头矩阵写入到纹理数据中
如下代码：

 //Writes the bone transforms of the visible entities into the matrix texture.
 //
 //currentCamera: camera used to cull entities individually.
 //Returns the number of instances to render: when the bone matrix lookup is in use this
 //is whatever updateInstanceDataBuffer() reports, otherwise it is the number of entities
 //whose transforms were written here.
 size_t InstanceBatchHW_VTF::updateVertexTexture( Camera *currentCamera )
    {
        size_t renderedInstances = 0;
        bool useMatrixLookup = useBoneMatrixLookup();
        if (useMatrixLookup)
        {
            //if we are using bone matrix look up we have to update the instance buffer for the 
            //vertex texture to be relevant

            //also note that in this case the number of instances to render comes directly from the 
            //updateInstanceDataBuffer() function, not from this function.
            renderedInstances = updateInstanceDataBuffer(false, currentCamera);
        }

        
        mDirtyAnimation = false;

        //Now lock the texture and copy the 4x3 matrices!
        mMatrixTexture->getBuffer()->lock( HardwareBuffer::HBL_DISCARD );
        const PixelBox &pixelBox = mMatrixTexture->getBuffer()->getCurrentLock();

        float *pSource = static_cast<float*>(pixelBox.data);
        
        //One flag per lookup slot, so that a slot shared by several entities is written once
        vector<bool>::type writtenPositions(getMaxLookupTableInstances(), false);

        //Floats taken by one entity, and how many entities fit in a row before the
        //row padding has to be skipped
        size_t floatPerEntity = mMatricesPerInstance * mRowLength * 4;
        size_t entitiesPerPadding = (size_t)(mMaxFloatsPerLine / floatPerEntity);
        
        size_t instanceCount = mInstancedEntities.size();
        size_t updatedInstances = 0;

        float* transforms = NULL;
        //If using dual quaternions, write 3x4 matrices to a temporary buffer, then convert to dual quaternions
        if(mUseBoneDualQuaternions)
        {
            transforms = mTempTransformsArray3x4;
        }
        
        for(size_t i = 0 ; i < instanceCount ; ++i)
        {
            InstancedEntity* entity = mInstancedEntities[i];

            //Without the lookup table visible entities are packed back to back; with it
            //every entity writes to the slot it has been assigned
            size_t textureLookupPosition = updatedInstances;
            if (useMatrixLookup)
            {
                textureLookupPosition = entity->mTransformLookupNumber;
            }
            //Check that we are not using a lookup matrix or that we have not already written
            //The bone data
            if (((!useMatrixLookup) || !writtenPositions[entity->mTransformLookupNumber]) &&
                //Cull on an individual basis, the less entities are visible, the less instances we draw.
                //No need to use null matrices at all!
                (entity->findVisible( currentCamera )))
            {
                //Destination of this entity's data, skipping the padding of every full row before it
                float* pDest = pSource + floatPerEntity * textureLookupPosition + 
                    (size_t)(textureLookupPosition / entitiesPerPadding) * mWidthFloatsPadding;

                //Plain matrices are written straight into the texture
                if(!mUseBoneDualQuaternions)
                {
                    transforms = pDest;
                }
                
                if( mMeshReference->hasSkeleton() )
                    mDirtyAnimation |= entity->_updateAnimation();

                size_t floatsWritten = entity->getTransforms3x4( transforms );

                if( !useMatrixLookup && mManager->getCameraRelativeRendering() )
                    makeMatrixCameraRelative3x4( transforms, floatsWritten );

                if(mUseBoneDualQuaternions)
                {
                    //12 floats per 3x4 matrix
                    convert3x4MatricesToDualQuaternions(transforms, floatsWritten / 12, pDest);
                }

                if (useMatrixLookup)
                {
                    writtenPositions[entity->mTransformLookupNumber] = true;
                }
                else
                {
                    ++updatedInstances;
                }
            }
        }

        if (!useMatrixLookup)
        {
            renderedInstances = updatedInstances;
        }

        mMatrixTexture->getBuffer()->unlock();

        return renderedInstances;
    }

4、InstanceBatchHW_VTF GPU顶点着色器数据更新

vertex shader中
attribute vec4 uv1 //单个骨头3个像素偏移值 w为0
attribute vec4 uv2 //对象对应着entity起始纹理坐标

代码注释详解如下代码：

#version 120

//Vertex input
//NOTE(review): the rest of this shader is omitted, so the roles described below come
//from the accompanying text and the C++ vertex declaration code - confirm them against
//the full shader.
attribute vec4 vertex;
attribute vec3 normal;

//Blend weights are only declared when the shader is compiled with BONE_TWO_WEIGHTS
#ifdef BONE_TWO_WEIGHTS
	attribute vec4 blendWeights;
#endif

//uv1: per the accompanying text, the offsets of the three texels of the vertex's bone
//     matrix, with w set to 0
//uv2: per the accompanying text, the texture coordinate at which the entity's matrices start
attribute vec4 uv0;
attribute vec4 uv1;
attribute vec4 uv2;
	
//Only declared when BONE_MATRIX_LUT evaluates to non-zero; presumably the three float4
//holding each entity's world transform when the bone matrix lookup is used - TODO confirm
#if BONE_MATRIX_LUT
	attribute vec4 uv3;
	attribute vec4 uv4;
	attribute vec4 uv5;
#endif

attribute vec3 tangent;

下面省略。。。。。。。。。。。。。