基于 Windows SDK 与 Visual C++ 2008 在微软平台上构建自己的语音识别引擎(适用于 Windows 2000/XP/2003/Vista 及 Windows CE/Mobile)

本文档介绍如何利用 Windows SDK 和 Visual C++ 2008 在微软平台上构建自己的语音识别引擎。项目开源,源码可提供。文中详细介绍了引擎的框架结构,包括 CSrEngineAlternates、CSampleSRExtension 等核心类,并提供了相关的核心源码片段。适用平台包括 Windows 2000/XP/2003/Vista 以及 Windows CE/Mobile。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

               

基于 Windows SDK 与 Visual C++ 2008,在微软平台上构建自己的语音识别引擎(适用于 Windows 2000/XP/2003/Vista 及 Windows CE/Mobile)。本项目开源,需要源码的请留下你们的 Email,我给大家发送。

 

本人闲来无事,自行开发了一个小型的语音识别引擎,搭建起在微软平台上的语音识别框架服务体系,

鉴于本人个人力量有限,为了将语音识别引擎做的功能更加强悍,强大,

现在将该系统开源,需要源码的请在本人 CSDN 博客下留下 Email,

本系统属于系统框架,搭建起一个语音识别的引擎服务框架,

在微软平台上畅通无阻,

 

现在将本系统构架公布一下,

并贴出相关核心源码,源码体积为37M,编译后为3M,

适用于 Windows 2000/XP/2003/Vista 及 Windows CE/Mobile。

 

框架头文件简介:

srengalt.h文件此文件包含的语音CSrEngineAlternates类。 这实现了接口ISpSRAlternates ,当一个应用程序GetAlternates或识别的结果, 将寻找AlternatesCLSID的识别引擎对象,并创建此对象。 并返回结果#pragma once#include "stdafx.h"#include "SampleSrEngine.h"#include "resource.h"class ATL_NO_VTABLE CSrEngineAlternates :  public CComObjectRootEx<CComMultiThreadModel>, public CComCoClass<CSrEngineAlternates, &CLSID_SampleSREngineAlternates>,    public ISpSRAlternates{public:    DECLARE_REGISTRY_RESOURCEID(IDR_SRENGALT)    DECLARE_PROTECT_FINAL_CONSTRUCT()    BEGIN_COM_MAP(CSrEngineAlternates)     COM_INTERFACE_ENTRY(ISpSRAlternates)    END_COM_MAP()public:    STDMETHODIMP GetAlternates(         SPPHRASEALTREQUEST *pAltRequest,        SPPHRASEALT **ppAlts,        ULONG *pcAlts);        STDMETHODIMP Commit(         SPPHRASEALTREQUEST *pAltRequest,        SPPHRASEALT *pAlt,        void **ppvResultExtra,        ULONG *pcbResultExtra);}; srengext.h 文件此文件包含CSampleSRExtension类。 这实现了自定义接口ISampleSRExtension  当一个应用程序开始识别, SAPI的将寻找ExtensionCLSID领域中的引擎对象的指针,并创建该对象,然后创建语音识别的要求。#pragma once#include "stdafx.h" #include "SampleSrEngine.h"#include "resource.h"class ATL_NO_VTABLE CSampleSRExtension : public CComObjectRootEx<CComMultiThreadModel>,public CComCoClass<CSampleSRExtension, &CLSID_SampleSRExtension>,public ISampleSRExtension,public ISpDisplayAlternates,public ISpEnginePronunciation{public:DECLARE_REGISTRY_RESOURCEID(IDR_SRENGEXT)DECLARE_GET_CONTROLLING_UNKNOWN()DECLARE_PROTECT_FINAL_CONSTRUCT()        BEGIN_COM_MAP(CSampleSRExtension)    COM_INTERFACE_ENTRY(ISampleSRExtension)    COM_INTERFACE_ENTRY(ISpDisplayAlternates)    COM_INTERFACE_ENTRY(ISpEnginePronunciation)END_COM_MAP()            HRESULT FinalConstruct()    {    / /失败CRecoExt作为一个非累计对象。          / /创建CRecoExt的SAPI        if(GetControllingUnknown() == dynamic_cast<ISampleSRExtension *>(this) )        {            return E_FAIL;        }       / /这个接口的处理的SAPI 5.1 。          / /必须QI'd中的FinalConstruct对象并没有释放。         return OuterQueryInterface(IID__ISpPrivateEngineCall, (void 
**)&m_pEngineCall);    }    void FinalRelease()    {  // 不释放IID__ISpPrivateEngineCall这里    }    STDMETHODIMP ExamplePrivateEngineCall(void); // 测试方法    // ISpDisplayAlternates 方法    STDMETHODIMP GetDisplayAlternates(        const SPDISPLAYPHRASE *pPhrase,         ULONG cRequestCount,         SPDISPLAYPHRASE **ppCoMemPhrases,        ULONG *pcPhrasesReturned);    STDMETHODIMP SetFullStopTrailSpace(ULONG ulTrailSpace);    // ISpEnginePronunciation 方法    STDMETHODIMP Normalize(         LPCWSTR pszWord,        LPCWSTR pszLeftContext,        LPCWSTR pszRightContext,        WORD LangID,        SPNORMALIZATIONLIST *pNormalizationList);            STDMETHODIMP GetPronunciations(         LPCWSTR pszWord,        LPCWSTR pszLeftContext,        LPCWSTR pszRightContext,        WORD LangID,        SPWORDPRONUNCIATIONLIST *pEnginePronunciationList);private:    _ISpPrivateEngineCall *m_pEngineCall;}; /*******************************************************************************   srengobj.h *此文件包含的宣言CSrEngine类。 *本实施ISpSREngine , ISpSREngine2和ISpObjectWithToken 。 *这是主要识别引擎对象******************************************************************************/#pragma once#include "stdafx.h"#include "SampleSrEngine.h"#include "resource.h"//语音识别对象。每个条目的列表中的一个实例这个类。class CContext{public:    CContext *   m_pNext;    BOOL operator==(SPRECOCONTEXTHANDLE hContext)    {        return (m_hSapiContext == hContext);    }    CContext(SPRECOCONTEXTHANDLE hSapiContext) :         m_hSapiContext(hSapiContext)    {}    SPRECOCONTEXTHANDLE m_hSapiContext; };//reco语法存储。每个条目的列表中的一个实例这个类。class CDrvGrammar{public:    CDrvGrammar *   m_pNext;    SPGRAMMARHANDLE m_hSapiGrammar; //  根据 SAPI  创建语法    BOOL            m_SLMLoaded;    // 语法是否与听写相关    BOOL            m_SLMActive;    // 词典是否被激活    WCHAR* m_pWordSequenceText;     // 词典词表放在缓冲区    ULONG m_cchText;                // 字序缓冲区大小    SPTEXTSELECTIONINFO* m_pInfo; // 文字选择字序缓冲区    CDrvGrammar(SPGRAMMARHANDLE hSapiGrammar) :         m_hSapiGrammar(hSapiGrammar),     
   m_SLMLoaded(FALSE),        m_SLMActive(FALSE),        m_pWordSequenceText(NULL),        m_cchText(0),        m_pInfo(NULL)    {    }    ~CDrvGrammar()    {   / /释放资源         / /对于每个语法对象将被释放SetWordSequenceData(NULL, 0, NULL). / / SetWordSequenceData和SetTextSelection将释放的内存          / /在这里没有必要释放内存的m_pWordSequenceText和m_pInfo .    }#ifdef _WIN32_WCE    CDrvGrammar()    {    }    static LONG Compare(const CDrvGrammar *, const CDrvGrammar *)    {        return 0;    }#endif};/ /读取的RecognizeStream线程中的音频数据块。每一组/ /决定如果数据讲话或沉默和价值补充说,此队列。 / /解码器读取这些线程和进程。 / /关键部分是用来使队列线程安全的,和一个事件是用来/ /显示如果缓冲区已空或没有。 / /这非常象roughtly模拟,这样做的特征提取/ /一个线程,并通过功能流的解码器。class CFrameQueue{public:    BOOL    m_aFrames[100]; // 语音识别返回值    ULONG   m_cFrames;    ULONG   m_ulHeadIndex;    HANDLE  m_hSpaceAvailEvent;    CRITICAL_SECTION m_cs;    CFrameQueue()    {        m_cFrames = 0;        m_ulHeadIndex = 0;        m_hSpaceAvailEvent = NULL;        InitializeCriticalSection(&m_cs);    }    ~CFrameQueue()    {        DeleteCriticalSection(&m_cs);    }    void SetSpaceAvailEvent(HANDLE h)    {        m_hSpaceAvailEvent = h;    }    void InsertTail(BOOL b)    {        EnterCriticalSection(&m_cs);        ULONG ulTailIndex = (m_ulHeadIndex + m_cFrames) % sp_countof(m_aFrames);        m_aFrames[ulTailIndex] = b;        m_cFrames++;        if (m_cFrames == sp_countof(m_aFrames))        {            ResetEvent(m_hSpaceAvailEvent);        }        LeaveCriticalSection(&m_cs);    }    BOOL IsFull()    {        EnterCriticalSection(&m_cs);        BOOL b = (m_cFrames == sp_countof(m_aFrames));        LeaveCriticalSection(&m_cs);        return b;    }    BOOL RemoveHead()    {        EnterCriticalSection(&m_cs);        BOOL b = m_aFrames[m_ulHeadIndex];        m_ulHeadIndex = (m_ulHeadIndex + 1) % sp_countof(m_aFrames);        m_cFrames--;        SetEvent(m_hSpaceAvailEvent);        LeaveCriticalSection(&m_cs);        return b;    }    BOOL HasData()    {        EnterCriticalSection(&m_cs);        ULONG cFrames = 
m_cFrames;        LeaveCriticalSection(&m_cs);        return cFrames;    }};//我们可以使用CSpBasicQueue信息存储规则class CRuleEntry{public:    BOOL operator==(SPRULEHANDLE rh)    {        return (m_hRule == rh);    }    CRuleEntry   * m_pNext;    SPRULEHANDLE m_hRule;   // SAPI 规则句柄    BOOL m_fTopLevel;       // 显示规则是否被激活    BOOL m_fActive;         // 显示识别引擎的设置}; // 语音识别类class ATL_NO_VTABLE CSrEngine :     public CComObjectRootEx<CComMultiThreadModel>,    public CComCoClass<CSrEngine, &CLSID_SampleSREngine>,    public ISpSREngine2,    public ISpObjectWithToken,    public ISpThreadTask{public:    CSrEngine() :        m_ulNextGrammarIndex(0),        m_cActive(0),        m_bPhraseStarted(FALSE),        m_bSoundStarted(FALSE),        m_hQueueHasRoom(NULL),        m_hRequestSync(NULL),        m_LangID(0)        {}DECLARE_REGISTRY_RESOURCEID(IDR_SRENG)DECLARE_PROTECT_FINAL_CONSTRUCT()BEGIN_COM_MAP(CSrEngine)    COM_INTERFACE_ENTRY(ISpSREngine)    COM_INTERFACE_ENTRY(ISpSREngine2)    COM_INTERFACE_ENTRY(ISpObjectWithToken)END_COM_MAP()private:    HANDLE                          m_hRequestSync;    CFrameQueue                     m_FrameQueue;    ULONG                           m_cBlahBlah;        CSpBasicQueue<CDrvGrammar>      m_GrammarList;    CSpBasicQueue<CContext>         m_ContextList;    ULONG                           m_ulNextGrammarIndex;    ULONG                           m_cActive;    ULONGLONG                       m_ullStart;    ULONGLONG                       m_ullEnd;    BOOL                            m_bSoundStarted:1;    BOOL       m_bPhraseStarted:1;    CComPtr<ISpSREngineSite>        m_cpSite;    CComPtr<ISpThreadControl>       m_cpDecoderThread;    HANDLE                          m_hQueueHasRoom;    CSpBasicQueue<CRuleEntry>       m_RuleList;    CComPtr<ISpLexicon>             m_cpLexicon;    CComPtr<ISpObjectToken>         m_cpEngineObjectToken;    CComPtr<ISpObjectToken>         m_cpUserObjectToken;    LANGID                          m_LangID;public:    HRESULT 
RandomlyWalkRule(SPRECORESULTINFO * pResult, ULONG nWords, ULONGLONG ullAudioPos, ULONG ulAudioSize);    HRESULT RecurseWalk(SPSTATEHANDLE hState, SPPATHENTRY * pPath, ULONG * pcTrans);    HRESULT WalkCFGRule(SPRECORESULTINFO * pResult, ULONG cRulesActive, BOOL fHypothesis,                        ULONG nWords, ULONGLONG ullAudioPos, ULONG ulAudioSize);    HRESULT WalkSLM(SPRECORESULTINFO * pResult, ULONG cSLMActive,                    ULONG nWords, ULONGLONG ullAudioPos, ULONG ulAudioSize);    HRESULT WalkTextBuffer(void* pvGrammarCookie, SPPATHENTRY * pPath, SPTRANSITIONID hId, ULONG * pcTrans);    HRESULT AddEvent(SPEVENTENUM eEvent, ULONGLONG ullStreamPos, WPARAM wParam = 0, LPARAM lParam = 0);    HRESULT AddEventString(SPEVENTENUM eEvent, ULONGLONG ulLStreamPos, const WCHAR * psz, WPARAM = 0);    HRESULT CreatePhraseFromRule( CRuleEntry * pRule, BOOL fHypothesis,                                  ULONGLONG ullAudioPos, ULONG ulAudioSize,                                  ISpPhraseBuilder** ppPhrase );    CRuleEntry* FindRule( ULONG ulRuleIndex );    CRuleEntry* NextRuleAlt( CRuleEntry * pPriRule, CRuleEntry * pLastRule );    void _CheckRecognition();    void _NotifyRecognition(BOOL fHypothesis, ULONG nWords);    HRESULT FinalConstruct();    HRESULT FinalRelease();    STDMETHODIMP SetObjectToken(ISpObjectToken * pToken);    STDMETHODIMP GetObjectToken(ISpObjectToken ** ppToken);    STDMETHODIMP SetRecoProfile(ISpObjectToken * pProfileToken);    STDMETHODIMP SetSite(ISpSREngineSite *pSite);    STDMETHODIMP GetInputAudioFormat(const GUID * pSrcFormatId, const WAVEFORMATEX * pSrcWFEX,                                     GUID * pDesiredFormatId, WAVEFORMATEX ** ppCoMemDesiredWFEX);    STDMETHODIMP OnCreateRecoContext(SPRECOCONTEXTHANDLE hSAPIRecoContext, void ** ppvDrvCtxt);    STDMETHODIMP OnDeleteRecoContext(void * pvDrvCtxt);    STDMETHODIMP OnCreateGrammar(void * pvEngineRecoContext,                                 SPGRAMMARHANDLE hSAPIGrammar,                     
            void ** ppvEngineGrammar);    STDMETHODIMP OnDeleteGrammar(void * pvEngineGrammar);    STDMETHODIMP WordNotify(SPCFGNOTIFY Action, ULONG cWords, const SPWORDENTRY * pWords);    STDMETHODIMP RuleNotify(SPCFGNOTIFY Action, ULONG cRules, const SPRULEENTRY * pRules);    STDMETHODIMP LoadProprietaryGrammar(void * pvEngineGrammar,                                        REFGUID rguidParam,                                        const WCHAR * pszStringParam,                                        const void * pvDataParam,                                        ULONG ulDataSize,                                        SPLOADOPTIONS Options)    {        return E_NOTIMPL;    }    STDMETHODIMP UnloadProprietaryGrammar(void * pvEngineGrammar)    {        return E_NOTIMPL;    }    STDMETHODIMP SetProprietaryRuleState(void * pvEngineGrammar,                                     const WCHAR * pszName,                                    void * pvReserved,                                    SPRULESTATE NewState,                                    ULONG * pcRulesChanged)    {        return E_NOTIMPL;    }    STDMETHODIMP SetProprietaryRuleIdState(void * pvEngineGrammar,                                       DWORD dwRuleId,                                      SPRULESTATE NewState)    {        return E_NOTIMPL;    }     /由于这个引擎不支持专有的语法,我们并不需要执行     / /此方法不仅仅是返回S_OK 。注意执行不返回 E_NOTIMPL 。     / /仅仅返回S_OK ,并忽略这个数据如果您不需要它执行专有语法。    STDMETHODIMP SetGrammarState(void * pvEngineGrammar, SPGRAMMARSTATE eGrammarState)    {        return S_OK;    }    STDMETHODIMP SetContextState(void * pvEngineContxt, SPCONTEXTSTATE eCtxtState)    {        return S_OK;    }    // 字典方法    STDMETHODIMP LoadSLM(void * pvEngineGrammar, const WCHAR * pszTopicName);    STDMETHODIMP UnloadSLM(void * pvEngineGrammar);    STDMETHODIMP SetSLMState(void * pvEngineGrammar, SPRULESTATE NewState);    STDMETHODIMP IsPronounceable(void *pDrvGrammar, const WCHAR *pszWord, SPWORDPRONOUNCEABLE * pWordPronounceable);    
STDMETHODIMP SetWordSequenceData(void * pvEngineGrammar, const WCHAR * pText, ULONG cchText, const SPTEXTSELECTIONINFO * pInfo);    STDMETHODIMP SetTextSelection(void * pvEngineGrammar, const SPTEXTSELECTIONINFO * pInfo);    STDMETHODIMP SetAdaptationData(void * pvEngineCtxtCookie, const WCHAR * pText, const ULONG cch);        STDMETHODIMP SetPropertyNum( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, LONG lValue );    STDMETHODIMP GetPropertyNum( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, LONG * plValue );    STDMETHODIMP SetPropertyString( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, const WCHAR* pValue );    STDMETHODIMP GetPropertyString( SPPROPSRC eSrc, void* pvSrcObj, const WCHAR* pName, __deref_out_opt WCHAR** ppCoMemValue );    // 语音识别方法    STDMETHODIMP RecognizeStream(REFGUID rguidFmtId, const WAVEFORMATEX * pWaveFormatEx,                            HANDLE hRequestSync, HANDLE hDataAvailable,                            HANDLE hExit, BOOL fNewAudioStream, BOOL fRealTimeAudio,                            ISpObjectToken * pAudioObjectToken);    STDMETHODIMP PrivateCall(void * pvEngineContext, void * pCallFrame, ULONG ulCallFrameSize);    STDMETHODIMP PrivateCallEx(void * pvEngineContext, const void * pInCallFrame, ULONG ulCallFrameSize,                               void ** ppvCoMemResponse, ULONG * pcbResponse);    // 语音识别线程    STDMETHODIMP InitThread( void * pvTaskData, HWND hwnd )    {        return S_OK;    }    LRESULT STDMETHODCALLTYPE WindowMessage( void *pvTaskData, HWND hWnd, UINT Msg, WPARAM wParam, LPARAM lParam )    {        return E_UNEXPECTED;    }    STDMETHODIMP ThreadProc( void *pvTaskData, HANDLE hExitThreadEvent, HANDLE hNotifyEvent, HWND hwndWorker, volatile const BOOL * pfContinueProcessing );    // 语音引擎方法    STDMETHODIMP PrivateCallImmediate(             void *pvEngineContext,            const void *pInCallFrame,            ULONG ulInCallFrameSize,            void **ppvCoMemResponse,            ULONG 
*pulResponseSize);            STDMETHODIMP SetAdaptationData2(             void *pvEngineContext,            __in_ecount(cch)  const WCHAR *pAdaptationData,            const ULONG cch,            LPCWSTR pTopicName,            SPADAPTATIONSETTINGS eSettings,            SPADAPTATIONRELEVANCE eRelevance);            STDMETHODIMP SetGrammarPrefix(             void *pvEngineGrammar,            __in_opt  LPCWSTR pszPrefix,            BOOL fIsPrefixRequired);            STDMETHODIMP SetRulePriority(             SPRULEHANDLE hRule,            void *pvClientRuleContext,            int nRulePriority);            STDMETHODIMP EmulateRecognition(             ISpPhrase *pPhrase,            DWORD dwCompareFlags);            STDMETHODIMP SetSLMWeight(             void *pvEngineGrammar,            float flWeight);            STDMETHODIMP SetRuleWeight(             SPRULEHANDLE hRule,            void *pvClientRuleContext,            float flWeight);            STDMETHODIMP SetTrainingState(             BOOL fDoingTraining,            BOOL fAdaptFromTrainingData);            STDMETHODIMP ResetAcousticModelAdaptation( void);            STDMETHODIMP OnLoadCFG(             void *pvEngineGrammar,            const SPBINARYGRAMMAR *pGrammarData,            ULONG ulGrammarID);            STDMETHODIMP OnUnloadCFG(             void *pvEngineGramma

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值