上层 Ceph 的机制,我这里就不详细解释了。当对象经过 CRUSH map 计算、落到对应的 OSD 之后,OSD 到底是如何把 bufferlist 中的数据存储到文件里的呢?一言不合就上代码:)
在FileStore.cc中会有下面的代码,根据操作类型进行相关的逻辑:
// Write operation: decode the target collection/object, the byte range and
// the payload from the transaction iterator, then hand them to _write().
case Transaction::OP_WRITE:
  {
    coll_t cid = i.get_cid(op->cid);
    ghobject_t oid = i.get_oid(op->oid);
    _kludge_temp_object_collection(cid, oid);
    uint64_t off = op->off;
    uint64_t len = op->len;
    uint32_t fadvise_flags = i.get_fadvise_flags();
    bufferlist bl;
    i.decode_bl(bl);
    tracepoint(objectstore, write_enter, osr_name, off, len);
    // The write is only issued when _check_replay_guard() returns a
    // positive value; execution continues in FileStore::_write().
    if (_check_replay_guard(cid, oid, spos) > 0)
      r = _write(cid, oid, off, len, bl, fadvise_flags);
    tracepoint(objectstore, write_exit, r);
  }
  break;
int FileStore::_write(const coll_t& cid, const ghobject_t& oid,
uint64_t offset, size_t len,
const bufferlist& bl, uint32_t fadvise_flags)
{
dout(15) << "write " << cid << "/" << oid << " " << offset << "~" << len << dendl;
int r;
FDRef fd;
r = lfn_open(cid, oid, true, &fd); -------打开文件
if (r < 0) {
dout(0) << "write couldn't open " << cid << "/"
<< oid << ": "
<< cpp_strerror(r) << dendl;
goto out;
}
// write
r = bl.write_fd(**fd, offset); ---------写入文件,此处就调用了bufferlist的write_fd方法
if (r == 0)
r = bl.length();
if (r >= 0 && m_filestore_sloppy_crc) {
int rc = backend->_crc_update_write(**fd, offset, len, bl);
assert(rc >= 0);
}
if (replaying || m_disable_wbthrottle) {
if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) {
posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED);
}
} else {
wbthrottle.queue_wb(fd, oid, offset, len,
fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
}
lfn_close(fd);
out:
dout(10) << "write " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
return r;
}
/**
 * Write every buffer segment of this list to @p fd, starting at file
 * offset @p offset.
 *
 * The segments in _buffers are packed into an iovec array in batches of at
 * most IOV_MAX entries. Each batch is handed to do_writev(), and the file
 * offset is then advanced by the number of bytes in that batch.
 *
 * @param fd      file descriptor to write to
 * @param offset  file offset of the first byte
 * @return 0 on success, otherwise the negative value returned by
 *         do_writev().
 */
int buffer::list::write_fd(int fd, uint64_t offset) const
{
  iovec iov[IOV_MAX];

  std::list<ptr>::const_iterator p = _buffers.begin();
  uint64_t left_pbrs = _buffers.size();
  while (left_pbrs) {
    // Gather the next batch of segments (no data is copied, only the
    // pointer and length of each segment are recorded).
    ssize_t bytes = 0;
    unsigned iovlen = 0;
    uint64_t size = MIN(left_pbrs, IOV_MAX);
    left_pbrs -= size;
    while (size > 0) {
      iov[iovlen].iov_base = (void *)p->c_str();
      iov[iovlen].iov_len = p->length();
      iovlen++;
      bytes += p->length();
      ++p;
      size--;
    }

    // Write this batch; see do_writev() for the actual system call.
    int r = do_writev(fd, iov, offset, iovlen, bytes);
    if (r < 0)
      return r;
    offset += bytes;
  }
  return 0;
}
/**
 * Write @p bytes bytes described by the @p veclen entries of @p vec to
 * @p fd at file offset @p offset, retrying until everything is written.
 *
 * With HAVE_PWRITEV the positioned pwritev() call is used; otherwise the
 * file offset is set with lseek64() and writev() is called. A call
 * interrupted by EINTR is retried. After a partial write, fully written
 * entries are skipped and the first remaining entry is adjusted in place,
 * so the caller's @p vec array may be modified.
 *
 * @param fd      file descriptor to write to
 * @param vec     iovec array; may be modified on partial writes
 * @param offset  file offset of the first byte
 * @param veclen  number of entries in @p vec
 * @param bytes   total number of bytes described by @p vec
 * @return 0 when all bytes were written, otherwise -errno.
 */
static int do_writev(int fd, struct iovec *vec, uint64_t offset, unsigned veclen, unsigned bytes)
{
  ssize_t r = 0;
  while (bytes > 0) {
#ifdef HAVE_PWRITEV
    r = ::pwritev(fd, vec, veclen, offset);
#else
    // Position the file at the offset where the previous write stopped.
    r = ::lseek64(fd, offset, SEEK_SET);
    if (static_cast<uint64_t>(r) != offset) {
      r = -errno;
      return r;
    }
    // The actual gather-write system call.
    r = ::writev(fd, vec, veclen);
#endif
    if (r < 0) {
      if (errno == EINTR)
        continue;
      return -errno;
    }

    bytes -= r;
    offset += r;
    if (bytes == 0) break;

    // Partial write: advance past what has already been written.
    while (r > 0) {
      if (vec[0].iov_len <= (size_t)r) {
        // drain this whole item
        r -= vec[0].iov_len;
        ++vec;
        --veclen;
      } else {
        vec[0].iov_base = (char *)vec[0].iov_base + r;
        vec[0].iov_len -= r;
        break;
      }
    }
  }
  return 0;
}
简介
writev将多个数据存储在一起,将驻留在两个或更多的不连接的缓冲区中的数据一次写出去。
UNIX和WINSOCK提供了不同的实现方法。UNIX系统下,使用writev可以指定一系列的缓冲区来收集要写的数据,这样数据可以保存在多个缓冲区中,然后一次写出去,从而避免出现Nagle和延迟ACK算法的相互影响。
参数
#include <sys/uio.h>
/* Gather write: writes the cnt buffers described by the iov array to fd in
   one call. Returns the number of bytes transferred, or -1 on error. */
ssize_t writev( int fd, const struct iovec *iov, int cnt );
/* Declared with the same parameter list as writev; the same return
   convention (bytes transferred, -1 on error) is documented for both. */
ssize_t readv( int fd, const struct iovec *iov, int cnt );
返回值:传输字节数,出错时返回-1.
参数说明:
iov是一组iovec结构的指针,iovec结构如下:
struct iovec {
char *iov_base; /* base address pointer: points to the buffer */
size_t iov_len; /* length of that buffer */
};
说明:这个定义取自FreeBSD系统,许多系统现在定义基本地址指针为void *iov_base;
cnt是数组中iovec结构的个数,即分开缓冲区的个数。
示例:
#include <sys/uio.h>
/*
 * Example: read lines from stdin and send each one over a socket as a
 * length-prefixed record, using a single writev() call per line.
 */
int main( int argc, char **argv )
{
    SOCKET s;
    int wire_len;
    char line[ 128 ];
    struct iovec iov[ 2 ];

    INIT();
    /* socket setup omitted */

    /*
     * writev() takes the iovec array as const, so the call never modifies
     * it; most fields can therefore be filled in once, outside the loop.
     */
    iov[ 1 ].iov_base = line;
    iov[ 0 ].iov_len = sizeof( wire_len );
    iov[ 0 ].iov_base = ( char * )&wire_len;

    for ( ;; )
    {
        if ( fgets( line, sizeof( line ), stdin ) == NULL )
            break;

        /* entry 1 carries the text, entry 0 its length in network byte order */
        iov[ 1 ].iov_len = strlen( line );
        wire_len = htonl( iov[ 1 ].iov_len );

        if ( writev( s, iov, 2 ) < 0 )
            error( 1, errno, "writev failure" );
    }

    EXIT( 0 );
}
程序说明:用第二项保存读取到的输入数据,第一项记录读取数据的长度(已转换为网络字节序),然后将这两项通过套接字一次发送至对等方。
Winsock中类似函数
#include <winsock2.h>
int WSAAPI WSAsend( SOCKET s, LPWSABUF buf, DWORD cnt, LPDWORD sent, DWORD flags, LPWSAOVERLAPPED ov1, LPWSAOVERLAPPED_COMPLETION_ROUTINE func );
返回值:成功返回0,否则返回 SOCKET_ERROR
最后两个参数用于重叠I/O,buf指向WSABUF数据结构,作用和writev中的iovec结构相似
/* Buffer descriptor used by WSASend; the struct tag is _WSABUF. */
typedef struct _WSABUF {
u_long len; /* length of the buffer */
char FAR* buf; /* pointer to the buffer */
} WSABUF, FAR* LPWSABUF;
如果调用成功返回,参数sent所指向的DWORD变量中保存的就是已发送的字节数。