/*-------------------------------------------------------------------------
*
* fd.c
* Virtual file descriptor code.
*
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/storage/file/fd.c
*
* NOTES:
*
* This code manages a cache of 'virtual' file descriptors (VFDs).
* The server opens many file descriptors for a variety of reasons,
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a routine is
* opened using these interfaces, all subsequent operations must also
* be through these interfaces (the File type is not a real file
* descriptor).
*
* For this scheme to work, most (if not all) routines throughout the
* server should use these interfaces instead of calling the C library
* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
* may find ourselves short of real file descriptors anyway.
*
* This file used to contain a bunch of stuff to support RAID levels 0
* (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
* because the parallel query processing code that called it is all
* gone. If you really need it you could get it from the original
* POSTGRES source.
*-------------------------------------------------------------------------
*/
Fd.c 虚拟文件描述符代码
这段代码管理着一个“虚拟”文件描述符(VFD)的缓存。服务器会因为各种原因打开许多文件描述符,包括基本表、临时文件(比如排序和哈希的溢出文件),以及对 system(3) 等 C 库函数的各种调用,因此单个进程打开的文件数很容易超过系统的限制。(在很多现代操作系统中这个限制大约是256个,有的系统则低至32个。)
VFD 以 LRU 池的方式进行管理,并根据实际需要打开和关闭真正的操作系统文件描述符。很显然,如果一个文件是通过这些接口打开的,那么所有的后续操作也必须通过这些接口进行。(File 类型并不是一个真正的文件描述符。)
基于这种工作机制,服务器中大多数(即使不是全部)程序都应该使用这些接口,而不是自己直接调用C语言库函数(例如 open(2) 和 fopen(3))。否则,我们仍然可能遇到实际文件描述符不够用的情况。
这个文件过去包含一堆用来支持RAID级别0(jbod)、1(duplex)和5(xor parity)的代码。由于调用这些代码的并行查询处理代码已经全部移除,这些代码也随之移除了。如果你确实需要它,可以从最初的 POSTGRES 源码中获取。
* Private Routines
*
* Delete - delete a file from the Lru ring
* LruDelete - remove a file from the Lru ring and close its FD
* Insert - put a file at the front of the Lru ring
* LruInsert - put a file at the front of the Lru ring and open it
* ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
* AllocateVfd - grab a free (or new) file record (from VfdArray)
* FreeVfd - free a file record
*
* The Least Recently Used ring is a doubly linked list that begins and
* ends on element zero. Element zero is special -- it doesn't represent
* a file and its "fd" field always == VFD_CLOSED. Element zero is just an
* anchor that shows us the beginning/end of the ring.
* Only VFD elements that are currently really open (have an FD assigned) are
* in the Lru ring. Elements that are "virtually" open can be recognized
* by having a non-null fileName field.
1、VFD插入到LRU 中:
Vfd数据结构
/*
 * Vfd: bookkeeping record for one virtual file descriptor.
 *
 * Records live in the VfdCache array and are referred to by their array
 * index (a File).  Entry 0 never represents a file: its nextFree field
 * heads the free list and its lru links anchor the LRU ring.
 */
typedef struct vfd
{
int fd; /* current FD, or VFD_CLOSED if none */
unsigned short fdstate; /* bitflags for VFD's state */
ResourceOwner resowner; /* owner, for automatic cleanup */
/* index of the next unused entry; 0 marks the end of the free list */
File nextFree; /* link to next free VFD, if in freelist */
/* ring links, expressed as VfdCache indexes; the ring starts and ends at entry 0 */
File lruMoreRecently; /* doubly linked recency-of-use list */
File lruLessRecently;
/* LruInsert seeks back to this offset after re-opening the file */
off_t seekPos; /* current logical file position */
char *fileName; /* name of file, or NULL for unused VFD */
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
/* PathNameOpenFile stores these with O_CREAT, O_TRUNC and O_EXCL cleared */
int fileFlags; /* open(2) flags for (re)opening the file */
int fileMode; /* mode to pass to open(2) */
} Vfd;
所有的系统文件描述符都封装到Vfd当中进行管理,Vfd的第一个成员变量保存的就是实际的fd。InitFileAccess 初始化时,VfdCache 数组只包含一个元素 VfdCache[0],它仅作为头结点使用。进程第一次调用 AllocateVfd 时,空闲链表为空,于是把 VfdCache 扩充到32个元素,并将新增的每一个Vfd清零、fd字段置为VFD_CLOSED。新增的元素(下标1到31)通过成员nextFree链接成FreeList,链表头保存在 VfdCache[0].nextFree 中。
当需要打开一个文件的时候,就取出FreeList链表头元素,然后将该文件的文件描述符、文件名以及相关的标志信息填充到Vfd中。PostgreSQL 将所有当前真正打开(已分配操作系统fd)的文件的Vfd通过lruMoreRecently、lruLessRecently链接成一个以 VfdCache[0] 为头结点的双向环形链表。
根据文件名打开一个文件,分配一个Vfd并初始化该Vfd。
这里用到了strdup,它用malloc分配一块内存空间,并把参数字符串的内容复制进去。这段空间同样需要使用free进行释放,否则会造成内存泄漏。
/*
 * open a file in an arbitrary directory
 *
 * Allocates a VFD for fileName, opens the file with the given open(2)
 * flags and mode, and puts the VFD at the front of the LRU ring.
 *
 * Returns the new File (an index into VfdCache) on success.  If the open
 * fails, the VFD and the name copy are released and -1 is returned, with
 * errno left as it was right after the failed BasicOpenFile() call.
 * Failure to copy the file name is reported with ereport(ERROR).
 *
 * NB: if the passed pathname is relative (which it usually is),
 * it will be interpreted relative to the process' working directory
 * (which should always be $PGDATA when this code is running).
 */
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
	char	   *fnamecopy;
	File		file;
	Vfd		   *vfdP;

	DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
			   fileName, fileFlags, fileMode));

	/*
	 * We need a malloc'd copy of the file name; fail cleanly if no room.
	 */
	fnamecopy = strdup(fileName);
	if (fnamecopy == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/* grab a VFD slot; "file" is its index in VfdCache */
	file = AllocateVfd();
	vfdP = &VfdCache[file];

	/*
	 * If we are at the limit of real descriptors, close least recently used
	 * files until we are under it or ReleaseLruFile() cannot release one.
	 */
	while (nfile + numAllocatedDescs >= max_safe_fds)
	{
		if (!ReleaseLruFile())
			break;
	}

	vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);

	if (vfdP->fd < 0)
	{
		/*
		 * Open failed: give back the VFD and the name copy.  Keep the
		 * caller's view of errno intact across the cleanup calls.
		 */
		int			save_errno = errno;

		FreeVfd(file);
		free(fnamecopy);
		errno = save_errno;
		return -1;
	}
	++nfile;					/* one more real descriptor in use */
	DO_DB(elog(LOG, "PathNameOpenFile: success %d", vfdP->fd));

	/* the file is really open, so it belongs at the front of the LRU ring */
	Insert(file);

	vfdP->fileName = fnamecopy;
	/* Saved flags are adjusted to be OK for re-opening file */
	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
	vfdP->fileMode = fileMode;
	vfdP->seekPos = 0;
	vfdP->fdstate = 0x0;
	vfdP->resowner = NULL;

	return file;
}
void
InitFileAccess(void)
{
Assert(SizeVfdCache == 0); /* call me only once */
/* initialize cache header entry */
VfdCache = (Vfd *) malloc(sizeof(Vfd));
if (VfdCache == NULL)
ereport(FATAL,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));//初始化内存空间为0
VfdCache->fd = VFD_CLOSED;//初始化fd为VFD_CLOSED
SizeVfdCache = 1;//VfdCache的大小为1
/* register proc-exit hook to ensure temp files are dropped at exit */
on_proc_exit(AtProcExit_Files, 0);
}
InitFileAccess 的主要功能是初始化VfdCache:分配一个Vfd的内存空间,将其中所有内容设置为0,并把VfdCache[0].fd设置为VFD_CLOSED。该Vfd不会分配给任何文件,只用作LRU环和空闲链表的头结点。
虚拟文件描述符(File)就是VfdCache数组的下标。下标0保留给头结点,因此分配给文件的编号从1开始:1,2,3,4……。VfdCache 第一次扩充时大小变为32,之后每次空闲链表用完时,数组大小扩充为原来的两倍。
/* A File is a virtual file descriptor: an index into VfdCache, not an OS fd. */
typedef int File;
static File
AllocateVfd(void)
{
Index i;
File file;
DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
if (VfdCache[0].nextFree == 0)
{
/*
* The free list is empty so it is time to increase the size of the
* array. We choose to double it each time this happens. However,
* there's not much point in starting *real* small.
*/
Size newCacheSize = SizeVfdCache * 2;
Vfd *newVfdCache;
if (newCacheSize < 32)
newCacheSize = 32;
/*
* Be careful not to clobber VfdCache ptr if realloc fails.
*/
newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
if (newVfdCache == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
VfdCache = newVfdCache;
/*
* Initialize the new entries and link them into the free list.
*/
for (i = SizeVfdCache; i < newCacheSize; i++)
{
MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
VfdCache[i].nextFree = i + 1;
VfdCache[i].fd = VFD_CLOSED;
}
VfdCache[newCacheSize - 1].nextFree = 0;
VfdCache[0].nextFree = SizeVfdCache;
/*
* Record the new size
*/
SizeVfdCache = newCacheSize;
}
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
return file;
}
/*
 * Insert --- link a VFD into the LRU ring right next to the anchor.
 *
 * The entry becomes the anchor's lruLessRecently neighbour, and the entry
 * that previously held that position becomes the new entry's
 * lruLessRecently neighbour.  Only the ring links are touched.
 */
static void
Insert(File file)
{
	Vfd		   *anchor;
	Vfd		   *entry;

	/* entry 0 is the ring anchor and must never be inserted itself */
	Assert(file != 0);

	DO_DB(elog(LOG, "Insert %d (%s)", file, VfdCache[file].fileName));
	DO_DB(_dump_lru());

	anchor = &VfdCache[0];
	entry = &VfdCache[file];

	/* splice the entry in between the anchor and its old neighbour */
	entry->lruMoreRecently = 0;
	entry->lruLessRecently = anchor->lruLessRecently;
	anchor->lruLessRecently = file;
	VfdCache[entry->lruLessRecently].lruMoreRecently = file;

	DO_DB(_dump_lru());
}
/*
 * LruInsert --- put a file at the front of the Lru ring and open it
 *
 * If the VFD has no real descriptor, the file is re-opened with its saved
 * name, flags and mode, and the saved seek position is restored.  The VFD
 * is then linked at the head of the LRU ring.
 *
 * returns 0 on success, -1 on re-open failure (with errno set)
 *
 * Failure to restore the seek position is treated like a re-open failure:
 * the descriptor is closed again and the VFD is left closed, so the caller
 * never continues with a file positioned at the wrong offset.
 */
static int
LruInsert(File file)
{
	Vfd		   *vfdP;

	Assert(file != 0);

	DO_DB(elog(LOG, "LruInsert %d (%s)", file, VfdCache[file].fileName));

	vfdP = &VfdCache[file];

	if (FileIsNotOpen(file))
	{
		/* make room for one more real descriptor if we are at the limit */
		while (nfile + numAllocatedDescs >= max_safe_fds)
		{
			if (!ReleaseLruFile())
				break;
		}

		/*
		 * The open could still fail for lack of file descriptors, eg due to
		 * overall system file table being full.  So, be prepared to release
		 * another FD if necessary...
		 */
		vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
								 vfdP->fileMode);
		if (vfdP->fd < 0)
		{
			DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
			return vfdP->fd;
		}
		else
		{
			DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
			++nfile;
		}

		/* seek to the right position */
		if (vfdP->seekPos != (off_t) 0)
		{
			if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) == (off_t) -1)
			{
				/*
				 * Could not get back to the saved position.  Undo the open
				 * and report failure, keeping lseek's errno for the caller.
				 */
				int			save_errno = errno;

				DO_DB(elog(LOG, "RE_SEEK FAILED: %d", save_errno));
				(void) close(vfdP->fd);
				vfdP->fd = VFD_CLOSED;
				--nfile;
				errno = save_errno;
				return -1;
			}
		}
	}

	/*
	 * put it at the head of the Lru ring
	 */
	Insert(file);

	return 0;
}