背景
我们在发布程序后,需要对程序进行维护。一旦遇到问题,需要及时定位并解决。这篇文章主要记录如何在程序中加入必要的检测工具,帮助我们尽快地定位问题。
程序捕捉crash的方法
开源程序
程序运行难免会出现crash的情况。crash在用户现场发生时,如果需要用户介入进行数据收集,就会严重降低用户体验。为了让crash的采集、上送都自动完成,就需要我们在程序中实现这些步骤。
恰好存在一个开源的软件crashrpt(http://crashrpt.sourceforge.net/),可以用来采集和上送crash信息。
导入说明
crashrpt的导入比较简单,根据说明即可实现。crashrpt包括采集、上送和分析三个模块,我们可以根据需要来使用。
目前我只用过采集功能,其余两个还没有使用过。这里先上代码(crashrpt的demo)
#include <windows.h>
#include <stdio.h>
#include <tchar.h>
// Include CrashRpt Header
#include "CrashRpt.h"
/// Crash notification hook that is handed to CrashRpt through
/// crSetCrashCallback().
///
/// @param pInfo  Crash description supplied by CrashRpt; not inspected here.
/// @return CR_CB_DODEFAULT, i.e. let CrashRpt go on and generate the error
///         report.
int CALLBACK CrashCallback(CR_CRASH_CALLBACK_INFO* pInfo)
{
    // The application has crashed. Nothing is customised at this point, so
    // simply ask the library to continue with its default handling.
    const int action = CR_CB_DODEFAULT;
    return action;
}
/// Worker-thread entry point of the demo.
///
/// The body stands in for "some processing" and contains the demo's
/// deliberate bug: a write through a null pointer.
///
/// @param lpParam  Thread argument; unused.
/// @return 0 (the statement is kept for the signature; the loop never exits).
DWORD WINAPI ThreadProc(LPVOID lpParam)
{
    // Endless processing loop
    while (true)
    {
        // The hidden error lives here: the pointer is never given a target...
        int* target = NULL;
        // ...so this store results in an Access Violation
        *target = 13;
    }
    return 0;
}
int _tmain(int argc, _TCHAR* argv[])
{
// Define CrashRpt configuration parameters
CR_INSTALL_INFO info;
memset(&info, 0, sizeof(CR_INSTALL_INFO));
info.cb = sizeof(CR_INSTALL_INFO);
info.pszAppName = _T("MyApp");
info.pszAppVersion = _T("1.0.0");
info.pszEmailSubject = _T("MyApp 1.0.0 Error Report");
info.pszEmailTo = _T("myapp_support@hotmail.com");
info.pszUrl = _T("http://myapp.com/tools/crashrpt.php");
info.uPriorities[CR_HTTP] = 3; // First try send report over HTTP
info.uPriorities[CR_SMTP] = 2; // Second try send report over SMTP
info.uPriorities[CR_SMAPI] = 1; // Third try send report over Simple MAPI
// Install all available exception handlers
info.dwFlags |= CR_INST_ALL_POSSIBLE_HANDLERS;
// Restart the app on crash
info.dwFlags |= CR_INST_APP_RESTART;
info.dwFlags |= CR_INST_SEND_QUEUED_REPORTS;
info.pszRestartCmdLine = _T("/restart");
// Define the Privacy Policy URL
info.pszPrivacyPolicyURL = _T("http://myapp.com/privacypolicy.html");
// Install crash reporting
int nResult = crInstall(&info);
if(nResult!=0)
{
// Something goes wrong. Get error message.
TCHAR szErrorMsg[512] = _T("");
crGetLastErrorMsg(szErrorMsg, 512);
_tprintf_s(_T("%s\n"), szErrorMsg);
return 1;
}
// Set crash callback function
crSetCrashCallback(CrashCallback, NULL);
// The main code follows...
// Create the worker thread
HANDLE hWorkingThread = CreateThread(NULL, 0,
ThreadProc, (LPVOID)NULL, 0, NULL);
// There is a hidden error in the main() function
// Call of _tprintf_s with NULL parameter
TCHAR* szFormatString = NULL;
_tprintf_s(szFormatString);
// Wait until the worker thread is exited
WaitForSingleObject(hWorkingThread, INFINITE);
// Uninitialize CrashRpt before exiting the main function
crUninstall();
// Exit
return 0;
}
crashrpt的代码设置项较多,我们只需要关注crInstall和crSetCrashCallback这两个函数即可监控所有的线程,一旦发生crash就会进入CrashCallback回调函数。我们可以在这个函数中采集必要的信息。
这里提供一种可以直接读取stack数据的方法:
/// Builds a textual call stack for the thread described by an exception
/// context.
///
/// The stack is walked with DbgHelp's StackWalk64. A frame is reported only
/// when both its module and its symbol can be resolved; source file and line
/// are added when line information is available. At most 24 frames are
/// reported.
///
/// @param pExceptionPointers  Exception pointers of the crash; may be null.
/// @return One entry per reported frame, each formatted as
///         " at <module> <file>::<function> <line> \r\n". An empty string is
///         returned when no context record is available or when the code is
///         not built for x86 / x64.
std::string GetStackInfo(PEXCEPTION_POINTERS pExceptionPointers)
{
    if (pExceptionPointers == nullptr || pExceptionPointers->ContextRecord == nullptr)
        return "";

    // StackWalk64 is documented as possibly modifying the context it is
    // given, so walk a private copy and leave the caller's record untouched.
    CONTEXT context = *pExceptionPointers->ContextRecord;

    // Value-initialise the whole STACKFRAME64 (not just sizeof(STACKFRAME)).
    STACKFRAME64 sf = {};

    // The machine type has to match the CONTEXT layout this code was compiled
    // for, so it is selected at compile time together with the registers.
#if defined(_M_X64) || defined(__x86_64__)
    const DWORD machineType = IMAGE_FILE_MACHINE_AMD64;
    sf.AddrPC.Offset = context.Rip;
    sf.AddrStack.Offset = context.Rsp;
    sf.AddrFrame.Offset = context.Rbp;
#elif defined(_M_IX86) || defined(__i386__)
    const DWORD machineType = IMAGE_FILE_MACHINE_I386;
    sf.AddrPC.Offset = context.Eip;
    sf.AddrStack.Offset = context.Esp;
    sf.AddrFrame.Offset = context.Ebp;
#else
    const DWORD machineType = 0;
#endif
    sf.AddrPC.Mode = AddrModeFlat;
    sf.AddrStack.Mode = AddrModeFlat;
    sf.AddrFrame.Mode = AddrModeFlat;

    if (0 == machineType)
        return "";

    const HANDLE hProcess = GetCurrentProcess();
    const HANDLE hThread = GetCurrentThread();

    // Remember whether this call created the symbol session: if
    // SymInitialize fails, the walk is still attempted, but SymCleanup is not
    // called for a session this function did not set up.
    const BOOL ownsSymbolSession = SymInitialize(hProcess, "", TRUE);

    // Walk through the stack frames.
    std::vector<FunctionCall> callStack;
    while (StackWalk64(machineType, hProcess, hThread, &sf, &context, 0,
                       SymFunctionTableAccess64, SymGetModuleBase64, 0))
    {
        if (sf.AddrFrame.Offset == 0 || callStack.size() >= 24)
            break;

        // Value-initialise so that fields not filled in below (e.g. the line
        // number when no line information exists) are not left indeterminate.
        FunctionCall curCall{};
        curCall.Address = sf.AddrPC.Offset;

        // 1. Module that contains the address; frames without one are skipped.
        const DWORD64 moduleBase = SymGetModuleBase64(hProcess, sf.AddrPC.Offset);
        char ModuleName[MAX_PATH] = { 0 };
        if (!moduleBase ||
            !GetModuleFileNameA(reinterpret_cast<HMODULE>(moduleBase), ModuleName, MAX_PATH))
        {
            continue;
        }
        curCall.ModuleName = FunctionCall::GetFileName(ModuleName);

        // 2. Function name at the address. SYMBOL_INFO ends in a variable
        //    length name, so it is placed in an aligned buffer with room for
        //    MAX_SYM_NAME characters.
        const size_t nBuffSize =
            (sizeof(SYMBOL_INFO) + MAX_SYM_NAME * sizeof(TCHAR) + sizeof(ULONG64) - 1) / sizeof(ULONG64);
        ULONG64 symbolBuffer[nBuffSize] = { 0 };
        PSYMBOL_INFO pSymbol = reinterpret_cast<PSYMBOL_INFO>(symbolBuffer);
        pSymbol->SizeOfStruct = sizeof(SYMBOL_INFO);
        pSymbol->MaxNameLen = MAX_SYM_NAME;

        DWORD64 dwSymDisplacement = 0;
        if (!SymFromAddr(hProcess, sf.AddrPC.Offset, &dwSymDisplacement, pSymbol))
            continue;
        curCall.FunctionName = std::string(pSymbol->Name);

        // 3. Source file and line; optional, the frame is kept without them.
        IMAGEHLP_LINE64 lineInfo = { sizeof(IMAGEHLP_LINE64) };
        DWORD dwLineDisplacement = 0;
        if (SymGetLineFromAddr64(hProcess, sf.AddrPC.Offset, &dwLineDisplacement, &lineInfo))
        {
            curCall.FileName = FunctionCall::GetFileName(std::string(lineInfo.FileName));
            curCall.LineNumber = lineInfo.LineNumber;
        }

        callStack.push_back(curCall);
    }

    if (ownsSymbolSession)
        SymCleanup(hProcess);

    // Append one formatted entry per frame. Appending (instead of feeding the
    // accumulated text back through the formatter) yields the same text
    // without re-formatting the whole string for every frame.
    std::string strErrorLogInfo;
    for (const auto& call : callStack)
    {
        strErrorLogInfo += StringUtil::Format(" at %s %s::%s %d \r\n",
            call.ModuleName.c_str(),
            call.FileName.c_str(),
            call.FunctionName.c_str(),
            call.LineNumber);
    }
    return strErrorLogInfo;
}
其中pExceptionPointers来源于回调参数pInfo->pExceptionInfo->pexcptrs。
注意:想要使用上面的函数,程序必须使用VS2017及以上版本的IDE编译,且选择/DEBUG选项(生成pdb,但是产品中不带有pdb),这样stack的数据才会带有符号信息,才会对开发人员有排查的价值。
程序捕捉泄漏的方法
内存泄漏更难定位,且需要一定的时间积累才会显现。所以需要借助开源库从开始运行时就开始检测。
开源程序
VLD(https://kinddragon.github.io/vld/)的用法主要是在程序中集成vld的sdk,然后进行长时间的检测。
导入说明
这里就引用已有的博客(https://blog.youkuaiyun.com/lonely1047/article/details/120038929)