内核回调之一"读"到底

最新推荐文章于 2025-09-27 16:32:56 发布

转载最新推荐文章于 2025-09-27 16:32:56 发布 · 1.1k 阅读

linux 专栏收录该内容

97 篇文章

订阅专栏

本文详细跟踪了Linux内核中read函数的执行流程，从用户空间的read调用开始，深入到文件系统层（VFS）、YAFFS文件系统层，直至硬件层的读取操作。解析了内核服务例程、文件操作结构体、超级块、内存映射、页面缓存等关键组件的作用，揭示了内核如何高效地进行文件读取操作。

http://www.th7.cn/system/lin/2011/11/23/20649.shtml

内核回调之一"读"到底

2011-11-23 17:33:58 cnblogs.com- 郝一二三- 点击数:81

id="cproIframe_u848856" width="336" height="280" src="http://pos.baidu.com/acom?adn=3&at=103&aurl=&cad=1&ccd=24&cec=UTF-8&cfv=11&ch=0&col=zh-CN&conOP=0&cpa=1&dai=2&dis=0&ltr=https%3A%2F%2Fwww.google.com.hk%2F&ltu=http%3A%2F%2Fwww.th7.cn%2Fsystem%2Flin%2F2011%2F11%2F23%2F20649.shtml&lunum=6&n=smileking_cpr&pcs=1440x783&pis=10000x10000&ps=350x280&psr=1440x900&pss=1440x351&qn=9338707924b205d3&rad=&rsi0=336&rsi1=280&rsi5=4&rss0=%23F5F8FD&rss1=%23F5F8FD&rss2=%23000000&rss3=%23444444&rss4=%23444444&rss5=&rss6=%23444444&rss7=&scale=&skin=&td_id=848856&tn=text_default_336_280&tpr=1415080749332&ts=1&xuanting=0&dtm=BAIDU_DUP2_SETJSONADSLOT&dc=2&di=u848856" align="center,center" marginwidth="0" marginheight="0" scrolling="no" frameborder="0" allowtransparency="true">

小小地跟踪下read函数，从 ssize_t read(int fd, void *buf, size_t count) 到DATASHEET 一调到底，见证内核的分层模块化。

--内核服务例程开始提供服务--

--fs/read_write.c--

/*
 * read(2) service routine: looks up the struct file behind fd, reads at
 * the file's current position through vfs_read(), then writes the updated
 * position back. Returns vfs_read()'s result, or -EBADF when fget_light()
 * yields no file for fd.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
    int fput_needed;
    struct file *file = fget_light(fd, &fput_needed);
    ssize_t nread;
    loff_t pos;

    if (!file)
        return -EBADF;

    pos = file_pos_read(file);
    nread = vfs_read(file, buf, count, &pos);        /* --> on into the VFS layer */
    file_pos_write(file, pos);
    fput_light(file, fput_needed);

    return nread;
}

--进入vfs层--

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{ 
    ... ...
    if (ret >= 0) {
        count = ret;
        if (file->f_op->read)
            ret = file->f_op->read(file, buf, count, pos);    //-->
    ... ...

    return ret;
}

发现回调函数，file->f_op->read

static const struct file_operations yaffs_file_operations = {    .read 	= do_sync_read,	//-->    .write 	= do_sync_write,    .aio_read 	= generic_file_aio_read,    .aio_write 	= generic_file_aio_write,    .mmap 	= generic_file_mmap,    .flush 	= yaffs_file_flush,    .fsync 	= yaffs_sync_object,    .sendfile 	= generic_file_sendfile,}

又是何时给回调函数挂上的钩子？

file = fget_light(fd, &fput_needed)

通过fd得出file。是谁将fd与file有了联系，是read之前的open。

--fs/open.c--

 669 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 670                     struct file *f,
 671                     int (*open)(struct inode *, struct file *),
 672                     const struct cred *cred)
 673 {

 684     inode = dentry->d_inode;

 704     f->f_op = fops_get(inode->i_fop);    //-->

 734     return f;
 735 }

我们发现：f->f_op等同了inode->i_fop。

--include/linux/fs.h--

1862 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
1863 #define fops_get(fops) \
1864     (((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
1865 #define fops_put(fops) \
1866     do { if (fops) module_put((fops)->owner); } while(0)

继续追踪inode，发现这么一个函数：yaffs_fill_inode_from_obj 。

inode->i_fop在这里赋值。函数名说的很清楚，通过yaffs_obj结构体来填充inode。填充inode就是该函数的使命。

--fs/yaffs2/yaffs_vfs.c--

1273 static void yaffs_fill_inode_from_obj(struct inode *inode,
1274                       struct yaffs_obj *obj)

1354         case S_IFREG:   /* file */
1355             inode->i_op             = &yaffs_file_inode_operations;
1356             inode->i_fop            = &yaffs_file_operations;
1357             inode->i_mapping->a_ops =
1358                 &yaffs_file_address_operations;
1359             break;

1378 }

看来这里还给inode->i_op 和 inode->i_mapping->a_ops 挂上了钩子，以后会用到。

紧接之前回调到do_sync_read。

ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
    ... ...

    for (;;) { 
        ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);    -->
        if (ret != -EIOCBRETRY)
            break;
        wait_on_retry_sync_kiocb(&kiocb);
    }
    ... ...
}

这里又转向了aio_read，即异步读取（asynchronous I/O）的接口。原有的read为同步读取；异步读写接口是后来才加进来的，于是同步的do_sync_read内部转而调用aio_read，并在返回-EIOCBRETRY时等待后重试。下面这个patch的说明正好提到了两者的关系：

From: Marco Stornelli <marco.stornelli@gmail.com>

If a filesystem in the file operations specifies for read and write operations only do_sync_read and do_sync_write without init aio_read and aio_write, there will be a kernel oops, because the vfs code check the presence of (to read for example) read OR aio_read method, then it calls read if it's pointer is not null. It's not sufficient because if the read function is actually a do_sync_read, it calls aio_read but without checking the presence. I think a BUG_ON check can be more useful.

Signed-off-by: Marco Stornelli <marco.stornelli@gmail.com>

回调到generic_file_aio_read。

1292 ssize_t
1293 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1294         unsigned long nr_segs, loff_t pos)
1295 {

1369         do_generic_file_read(filp, ppos, &desc, file_read_actor);    -->

1380 }

开始真正的读操作。
不管是读还是写，都会优先考虑cache。若无法命中，再进行传统意义上的actual read。

975 /**
 976  * do_generic_file_read - generic file read routine
 977  * @filp:   the file to read
 978  * @ppos:   current file position
 979  * @desc:   read_descriptor
 980  * @actor:  read method
 981  *
 982  * This is a generic file read routine, and uses the
 983  * mapping->a_ops->readpage() function for the actual low-level stuff.
 984  *
 985  * This is really ugly. But the goto's actually try to clarify some
 986  * of the logic when it comes to error handling etc.
 987  */
 988 static void do_generic_file_read(struct file *filp, loff_t *ppos,
 989         read_descriptor_t *desc, read_actor_t actor)
 990 {
 991     struct address_space *mapping = filp->f_mapping;

 


1134         /* Start the actual read. The read will unlock the page. */
1135         error = mapping->a_ops->readpage(filp, page); 　　//假设cache没有命中，就继续向下读，没有命中真是一种糟糕的情况

   

1203 }

又见回调。

/*
 * Page-cache (address_space) callbacks for YAFFS regular files.
 * .readpage is the hook do_generic_file_read() reaches through
 * mapping->a_ops->readpage when it starts the actual read.
 * The write-side pair depends on YAFFS_USE_WRITE_BEGIN_END.
 */
static struct address_space_operations yaffs_file_address_operations = {
    .readpage  = yaffs_readpage,    /* --> */
    .writepage = yaffs_writepage,
#if (YAFFS_USE_WRITE_BEGIN_END > 0)
    .write_begin = yaffs_write_begin,
    .write_end   = yaffs_write_end,
#else
    .prepare_write = yaffs_prepare_write,
    .commit_write  = yaffs_commit_write,
#endif
};

struct address_space_operations 为struct address_space的操作函数。

struct address_space 用于管理文件（struct inode)映射到内存的页面(struct page)；

与之对应，address_space_operations 就是用来操作该文件映射到内存的页面，比如把内存中的修改写回文件、从文件中读入数据到页面缓冲等。

也就是说address_space结构与文件的对应：一个具体的文件在打开后，内核会在内存中为之建立一个struct inode结构，其中的i_mapping域指向一个address_space结构。这样，一个文件就对应一个address_space结构，一个 address_space与一个偏移量能够确定一个page cache 或swap cache中的一个页面。因此，当要寻址某个数据时，很容易根据给定的文件及数据在文件内的偏移量而找到相应的页面。

-- fs/yaffs2/yaffs_vfs.c --

/*
 * address_space .readpage hook: emits a trace message on entry and exit
 * around yaffs_readpage_unlock() and hands its result straight back.
 */
static int yaffs_readpage(struct file *f, struct page *pg)
{
    int status;

    yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage");

    status = yaffs_readpage_unlock(f, pg);    /* --> */

    yaffs_trace(YAFFS_TRACE_OS, "yaffs_readpage done");

    return status;
}

看样子要读page：

/*
 * Runs yaffs_readpage_nolock() on the page, then calls UnlockPage() on it
 * regardless of the outcome, and returns the read result.
 */
static int yaffs_readpage_unlock(struct file *f, struct page *pg)
{
    int status;

    status = yaffs_readpage_nolock(f, pg);    /* --> */
    UnlockPage(pg);

    return status;
}

这里出现了file指针，我们知道file是个vfs逻辑上的概念，可能多个file对应一个inode。看来逻辑file就快转化为相应的文件物理地址。

/*
 * Excerpt (elided parts are marked "... ..."). The visible code maps the
 * page with kmap(), then calls yaffs_file_rd() for PAGE_CACHE_SIZE bytes
 * at file offset pg->index << PAGE_CACHE_SHIFT, bracketed by
 * yaffs_gross_lock(dev) / yaffs_gross_unlock(dev).
 */
static int yaffs_readpage_nolock(struct file *f, struct page *pg)
{
    ... ...
    /* FIXME: Can kmap fail? */
    pg_buf = kmap(pg);

/********************************

For reference, kmap() itself:

void *kmap(struct page *page)
{
    might_sleep();
    if (!PageHighMem(page))
        return page_address(page);
    return kmap_high(page);    // maps the high-memory page into the kernel address space and returns the mapped virtual address
}

 ********************************/

    yaffs_gross_lock(dev);

    ret = yaffs_file_rd(obj, pg_buf,
                pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE);    //-->
                        
    yaffs_gross_unlock(dev);

    ... ...
}

--文件系统：yaffs--

　　终于进入了yaffs_guts.c文件，该文件涉及到了yaffs的操作细节。

-- fs/yaffs2/yaffs_guts.c --

/*
 * Excerpt (elided parts are marked "... ..."): read n_bytes from object
 * `in`, starting at byte `offset`, into `buffer`, one chunk per loop pass.
 * Each pass copies n_copy bytes - either everything that is left, or only
 * up to the end of the current chunk - and advances offset and buffer by
 * that amount.
 * Returns n_done, the running total of the n_copy values.
 */
int yaffs_file_rd(struct yaffs_obj *in, u8 * buffer, loff_t offset, int n_bytes)
{
    ... ...

    int n = n_bytes;    /* author's note: page size, 4k (arm) */

    while (n > 0) {
        yaffs_addr_to_chunk(dev, offset, &chunk, &start);
                                        /* chunk = (u32) (addr >> dev->chunk_shift); */
        chunk++;

        /* OK now check for the curveball where the start and end are in
         * the same chunk.
         */
        if ((start + n) < dev->data_bytes_per_chunk)
            n_copy = n;
        else
            n_copy = dev->data_bytes_per_chunk - start;

        cache = yaffs_find_chunk_cache(in, chunk);

        /* If the chunk is already in the cache or it is less than
         * a whole chunk or we're using inband tags then use the cache
         * (if there is caching) else bypass the cache.
         */
        if (cache || n_copy != dev->data_bytes_per_chunk ||
            dev->param.inband_tags) {

            /* (body elided by the author) cache path: check the chunk cache first */

        } else {
            /* A full chunk. Read directly into the buffer. */
            yaffs_rd_data_obj(in, chunk, buffer);        /* --> */
        }
        n -= n_copy;
        offset += n_copy;
        buffer += n_copy;
        n_done += n_copy;
    }
    return n_done;
}

-->

/*inode_chunk：yaffs中的chunk下标*/
static int yaffs_rd_data_obj(struct yaffs_obj *in, int inode_chunk, u8 * buffer)
{
    int nand_chunk = yaffs_find_chunk_in_file(in, inode_chunk, NULL);

    if (nand_chunk >= 0)
        return yaffs_rd_chunk_tags_nand(in->my_dev, nand_chunk, buffer, NULL);    //-->
    else {
    ... ...
    }
}

有必要了解下：
    int nand_chunk = yaffs_find_chunk_in_file(in, inode_chunk, NULL);

   第二个参数inode_chunk是逻辑地址，即文件内部以chunk大小为单位编号的下标（yaffs1的chunk为512字节；yaffs2则取mtd->writesize，见后文超级块的填充部分）。
   YAFFS2文件系统使用树结点结构来完成逻辑chunk地址与物理地址的映射。显然，经过此函数的处理，就找到了逻辑下标对应的物理chunk下标（即返回值nand_chunk）。树结点用Tnode表示。

关于Tnode，涉及读yaffs的细节的理解，总之，yaffs作为文件系统就要管理物理页面，物理页面对应着逻辑chunk，出于文件寻找，文件扩大等效率方面的考虑，采用了数据结构——树。具体可参考有关yaffs的论文。

-->

--fs/yaffs2/yaffs_nand.c--

int yaffs_rd_chunk_tags_nand(struct yaffs_dev *dev, int nand_chunk,
                 u8 *buffer, struct yaffs_ext_tags *tags)
{
    ... ...

    if (dev->param.read_chunk_tags_fn)
        result =
            dev->param.read_chunk_tags_fn(dev, realigned_chunk, buffer, tags);    //-->
    ... ...
}

一个看似不一样的回调，赋值在这里：

--yaffs_vfs.c--

static struct super_block *yaffs_internal_read_super(int yaffs_version,
                                                     struct super_block *sb,
                                                     void *data, int silent)
{
    ... ...
    param->read_chunk_tags_fn = nandmtd2_read_chunk_tags;
    ... ...
}

进入yaffs_mtdif2.c文件，看来要到mtd层咯。

--fs/yaffs2/yaffs_mtdif2.c--

int nandmtd2_read_chunk_tags(struct yaffs_dev *dev, int nand_chunk,
                 u8 *data, struct yaffs_ext_tags *tags)
{
    ... ...

    if (dev->param.inband_tags || (data && !tags))
        retval = mtd->read(mtd, addr, dev->param.total_bytes_per_chunk,
                   &dummy, data);    //-->进入mtd层
    else if (tags) {
        ops.mode = MTD_OOB_AUTO;
        ops.ooblen = packed_tags_size;
        ops.len = data ? dev->data_bytes_per_chunk : packed_tags_size;
        ops.ooboffs = 0;
        ops.datbuf = data;
        ops.oobbuf = yaffs_dev_to_lc(dev)->spare_buffer;
        retval = mtd->read_oob(mtd, addr, &ops);
    }

    ... ...
}

在进入mtd层之前，简单的提下yaffs中的各种回调是在何时挂好的呢？yaffs又是如何注册进的内核？

--------------------
注册文件系统：
--------------------

--fs/yaffs2/yaffs_vfs.c--

static int __init init_yaffs_fs(void)
{
    int error = 0;
    struct file_system_to_install *fsinst;

    ... ...

    fsinst = fs_to_install;    //创建文件系统进程入口

/***********************************************************

    static struct file_system_to_install fs_to_install[] = {
        {&yaffs_fs_type, 0},
        {&yaffs2_fs_type, 0},
        {NULL, 0}
    };

 ************************************************************/


    while (fsinst->fst && !error) {
        error = register_filesystem(fsinst->fst);    //注册文件系统
        if (!error)
            fsinst->installed = 1;
        fsinst++;
    }

    ... ...

    return error;
}

注册文件系统后，yaffs2挂上fs list，开始超级块操作：

/*
 * Filesystem type descriptor for "yaffs2". .get_sb points at
 * yaffs2_read_super, the routine that reads the superblock.
 */
static struct file_system_type yaffs2_fs_type = {
    .name     = "yaffs2",
    .owner    = THIS_MODULE,
    .fs_flags = FS_REQUIRES_DEV,
    .get_sb   = yaffs2_read_super,    /* --> */
    .kill_sb  = kill_block_super,
};

读取超级块。

static int yaffs2_read_super(struct file_system_type *fs,
                             int flags,   const char *dev_name,   void *data,
                             struct vfsmount *mnt)
{   
    return get_sb_bdev(fs, flags, dev_name, data,
               yaffs2_internal_read_super_mtd, mnt);    //-->
}

----> mount_bdev函数中调用 fill_super, 也就是 yaffs2_internal_read_super_mtd

int get_sb_bdev(struct file_system_type *fs_type,
                int flags, const char *dev_name, void *data,
                int (*fill_super)(struct super_block *, void *, int),
                struct vfsmount *mnt)
{                        
    struct dentry *root; 

    //获得了超级块指针，如果s->s_root，也就是目录挂载点为空的化，那就要填充超级块
    root = mount_bdev(fs_type, flags, dev_name, data, fill_super);　　//-->
    if (IS_ERR(root))    
        return PTR_ERR(root);
    mnt->mnt_root = root;
    mnt->mnt_sb = root->d_sb;
    return 0;  
}

----> fill_super，填充超级块

/*
 * Delegates to get_sb_bdev(), passing yaffs2_internal_read_super_mtd as
 * the fill_super routine, and returns whatever get_sb_bdev() returns.
 */
static int yaffs2_read_super(struct file_system_type *fs,
                             int flags,   const char *dev_name,   void *data,
                             struct vfsmount *mnt)
{
    int err;

    err = get_sb_bdev(fs, flags, dev_name, data,
                      yaffs2_internal_read_super_mtd, mnt);    /* --> */

    return err;
}

---->

/*
 * fill_super adapter for yaffs2: calls yaffs_internal_read_super() with
 * version 2 and converts its pointer result into an int status -
 * 0 when a superblock pointer comes back, -EINVAL when it is NULL.
 */
static int yaffs2_internal_read_super_mtd(struct super_block *sb, void *data,
                                          int silent)
{
    if (!yaffs_internal_read_super(2, sb, data, silent))
        return -EINVAL;

    return 0;
}

----> 获取super block：kmalloc出空间，而后填充，返回。

static struct super_block *yaffs_internal_read_super(int yaffs_version,
                                                     struct super_block *sb,
                                                     void *data, int silent)
{
    int n_blocks;
    struct inode *inode = NULL;
    struct dentry *root;
    struct yaffs_dev *dev = 0;
    char devname_buf[BDEVNAME_SIZE + 1];
    struct mtd_info *mtd;
    int err;
    char *data_str = (char *)data;
    struct yaffs_linux_context *context = NULL;
    struct yaffs_param *param;

    int read_only = 0;

    struct yaffs_options options;

    unsigned mount_id;
    int found;
    struct yaffs_linux_context *context_iterator;
    struct list_head *l;

    if (!sb) {
        printk(KERN_INFO "yaffs: sb is NULL\n");
        return NULL;
    }

    sb->s_magic = YAFFS_MAGIC;
    sb->s_op = &yaffs_super_ops;    //super_block层的操作函数
    sb->s_flags |= MS_NOATIME;

    read_only = ((sb->s_flags & MS_RDONLY) != 0);

#ifdef YAFFS_COMPILE_EXPORTFS
    sb->s_export_op = &yaffs_export_ops;
#endif

    ... ...

    sb->s_blocksize      = PAGE_CACHE_SIZE;
    sb->s_blocksize_bits = PAGE_CACHE_SHIFT;

    ... ...

    /* Check it's an mtd device..... */
    if (MAJOR(sb->s_dev) != MTD_BLOCK_MAJOR)
        return NULL;    /* This isn't an mtd device */

/* --判断是mtd，开始mtd相关操作-- */

    /* Get the device */
    mtd = get_mtd_device(NULL, MINOR(sb->s_dev));    //MTD_BLOCK_MAJOR
    if (!mtd) {
        yaffs_trace(YAFFS_TRACE_ALWAYS,
            "yaffs: MTD device #%u doesn't appear to exist",
            MINOR(sb->s_dev));
        return NULL;
    }
    /* Check it's NAND */
    if (mtd->type != MTD_NANDFLASH) {
        yaffs_trace(YAFFS_TRACE_ALWAYS,
            "yaffs: MTD device is not NAND it's type %d",
            mtd->type);
        return NULL;
    }


/* 获得mtd之后，检查mtd各项及操作函数 */


    /* OK, so if we got here, we have an MTD that's NAND and looks
     * like it has the right capabilities
     * Set the struct yaffs_dev up for mtd
     */

    if (!read_only && !(mtd->flags & MTD_WRITEABLE)) {
        read_only = 1; 
        printk(KERN_INFO
               "yaffs: mtd is read only, setting superblock read only\n"
        );
        sb->s_flags |= MS_RDONLY;
    }



/* struct yaffs_dev */

    dev     = kmalloc(sizeof(struct yaffs_dev), GFP_KERNEL);
    context = kmalloc(sizeof(struct yaffs_linux_context), GFP_KERNEL);
    ... ...

    dev->os_context = context;
    INIT_LIST_HEAD(&(context->context_list));
    context->dev = dev;
    context->super = sb;

    dev->read_only = read_only;

#if (LINUX_VERSION_CODE > KERNEL_VERSION(2, 5, 0))
    sb->s_fs_info = dev;
#else
    sb->u.generic_sbp = dev;
#endif

    dev->driver_context = mtd;

/* struct yaffs_param */

    param->name = mtd->name;

    /* Set up the memory size parameters.... */

    n_blocks =
        YCALCBLOCKS(mtd->size,
            (YAFFS_CHUNKS_PER_BLOCK * YAFFS_BYTES_PER_CHUNK));


/****************************************************************

#define YAFFS_BYTES_PER_SPARE       16
#define YAFFS_BYTES_PER_CHUNK       512
#define YAFFS_CHUNK_SIZE_SHIFT      9
#define YAFFS_CHUNKS_PER_BLOCK      32
#define YAFFS_BYTES_PER_BLOCK   (YAFFS_CHUNKS_PER_BLOCK*YAFFS_BYTES_PER_CHUNK)

 ****************************************************************/

    param->start_block           = 0;
    param->end_block             = n_blocks - 1;
    param->chunks_per_block      = YAFFS_CHUNKS_PER_BLOCK;        //32
    param->total_bytes_per_chunk = YAFFS_BYTES_PER_CHUNK;         //512
    param->n_reserved_blocks     = 5;
    param->n_caches              = (options.no_cache) ? 0 : 10;
    param->inband_tags           = options.inband_tags;

#ifdef CONFIG_YAFFS_DISABLE_LAZY_LOAD
    param->disable_lazy_load = 1;
#endif
#ifdef CONFIG_YAFFS_XATTR
    param->enable_xattr = 1;
#endif


/**
 * struct yaffs_param 的填充 
 * 包括yaffs所有信息，操作函数等。
 */

    /* ... and the functions. */
    if (yaffs_version == 2) {
        param->write_chunk_tags_fn     = nandmtd2_write_chunk_tags;
        param->read_chunk_tags_fn      = nandmtd2_read_chunk_tags;
        param->bad_block_fn            = nandmtd2_mark_block_bad;
        param->query_block_fn          = nandmtd2_query_block;
        yaffs_dev_to_lc(dev)->spare_buffer 
                                       = kmalloc(mtd->oobsize, GFP_NOFS);
        param->is_yaffs2 = 1;
#if (LINUX_VERSION_CODE > KERNEL_VERSION(2, 6, 17))
        param->total_bytes_per_chunk   = mtd->writesize;
        param->chunks_per_block        = mtd->erasesize / mtd->writesize;
#else
        param->total_bytes_per_chunk   = mtd->oobblock;
        param->chunks_per_block        = mtd->erasesize / mtd->oobblock;
#endif
        n_blocks = YCALCBLOCKS(mtd->size, mtd->erasesize);

        param->start_block = 0;
        param->end_block = n_blocks - 1;
    } else {
    ... ...
    }

    ... ...

    err = yaffs_guts_initialise(dev);    //对dev赋值并检验。yaffs_gut.c -->

    ... ...

    if (!inode)
        return NULL;

/*索引节点操作的接口函数*/

    inode->i_op     = &yaffs_dir_inode_operations;
    inode->i_fop    = &yaffs_dir_operations;

    yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: got root inode");

    root = d_alloc_root(inode);

    yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: d_alloc_root done");

    if (!root) {
        iput(inode);
        return NULL;
    }
    sb->s_root = root;
    sb->s_dirt = !dev->is_checkpointed;
    yaffs_trace(YAFFS_TRACE_ALWAYS,
        "yaffs_read_super: is_checkpointed %d",
        dev->is_checkpointed);

    yaffs_trace(YAFFS_TRACE_OS, "yaffs_read_super: done");
    return sb;
}

----> 完成对dev赋值并检验

/*
 * Excerpt of the YAFFS core initialisation (elided parts are marked
 * "... ..."). Each fallible step shown here runs only while init_failed
 * is still clear, and sets init_failed when it returns zero.
 * yaffs_init_tnodes_and_objs() is called unconditionally.
 */
int yaffs_guts_initialise(struct yaffs_dev *dev)   
{
    ... ...
    if (!init_failed && !yaffs_init_blocks(dev))
        init_failed = 1;

    yaffs_init_tnodes_and_objs(dev);

    if (!init_failed && !yaffs_create_initial_dir(dev))
        init_failed = 1;
    ... ...
}

--mtd层--

mtd->read( ) 正式进入mtd层的地界，又是在何时挂上具体的操作函数?

nand_scan，nand驱动中相当面熟的函数。即使看不到它，十有八九也会看到它封装的nand_scan_ident和nand_scan_tail。

打开nand_scan_tail瞧一瞧：

int nand_scan_tail(struct mtd_info *mtd)
{
    ... ...

    case NAND_ECC_HW:
        /* Use standard hwecc read page function ? */
        if (!chip->ecc.read_page)
            chip->ecc.read_page         = nand_read_page_hwecc;
        if (!chip->ecc.write_page)
            chip->ecc.write_page        = nand_write_page_hwecc;
        if (!chip->ecc.read_page_raw)
            chip->ecc.read_page_raw     = nand_read_page_raw;
        if (!chip->ecc.write_page_raw)
            chip->ecc.write_page_raw    = nand_write_page_raw;
        if (!chip->ecc.read_oob)
            chip->ecc.read_oob          = nand_read_oob_std;
        if (!chip->ecc.write_oob)
            chip->ecc.write_oob         = nand_write_oob_std;

    ... ...

    /* Fill in remaining MTD driver data */
    mtd->type        = MTD_NANDFLASH;
    mtd->flags       = (chip->options & NAND_ROM) ? MTD_CAP_ROM : MTD_CAP_NANDFLASH;
    mtd->erase       = nand_erase;
    mtd->point       = NULL;
    mtd->unpoint     = NULL;
    mtd->read        = nand_read;    //-->
    mtd->write       = nand_write;
    mtd->panic_write = panic_nand_write;
    mtd->read_oob    = nand_read_oob;
    mtd->write_oob   = nand_write_oob;
    mtd->sync        = nand_sync;
    mtd->lock        = NULL;
    mtd->unlock      = NULL;
    mtd->suspend     = nand_suspend;
    mtd->resume      = nand_resume;
    mtd->block_isbad       = nand_block_isbad;
    mtd->block_markbad     = nand_block_markbad;
    mtd->writebufsize      = mtd->writesize;

    /* propagate ecc.layout to mtd_info */
    mtd->ecclayout = chip->ecc.layout;

    /* Check, if we should skip the bad block table scan */
    if (chip->options & NAND_SKIP_BBTSCAN)
        return 0;

    /* Build bad block table */
    return chip->scan_bbt(mtd);
}

也正如注释所言：

/**
 * nand_scan_tail - [NAND Interface] Scan for the NAND device
 * @mtd:        MTD device structure
 *
 * This is the second phase of the normal nand_scan() function. It
 * fills out all the uninitialized function pointers with the defaults
 * and scans for a bad block table if appropriate.
 */

nand驱动中调用nand_scan，便为该nandFlash设备的mtd层提供了上层接口。

--drivers/mtd/nand/nand_base.c --

/*
 * mtd->read implementation for NAND (installed by nand_scan_tail()).
 * @mtd:    MTD device structure; mtd->priv holds the struct nand_chip
 * @from:   offset to read from
 * @len:    number of bytes requested
 * @retlen: receives chip->ops.retlen after the read
 * @buf:    destination buffer
 *
 * Returns -EINVAL when from + len exceeds mtd->size, 0 when len is zero
 * (*retlen is not written on either early return), otherwise the result
 * of nand_do_read_ops().
 */
static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
                     size_t *retlen,  uint8_t *buf)
{
    struct nand_chip *chip;
    int status;

    /* Refuse requests that would run past the end of the device */
    if (mtd->size < (from + len))
        return -EINVAL;
    if (len == 0)
        return 0;

    chip = mtd->priv;
    nand_get_device(chip, mtd, FL_READING);

    /* Data-only request: no OOB buffer */
    chip->ops.oobbuf = NULL;
    chip->ops.datbuf = buf;
    chip->ops.len    = len;

    status = nand_do_read_ops(mtd, from, &chip->ops);    /* read data with ECC --> */
    *retlen = chip->ops.retlen;

    nand_release_device(mtd);

    return status;
}

---->

static int nand_do_read_ops(struct mtd_info *mtd, loff_t from,
                struct mtd_oob_ops *ops)
{

    while(1) {

    ... ...

            /* Now read the page into the buffer */
            if (unlikely(ops->mode == MTD_OOB_RAW))
                ret = chip->ecc.read_page_raw(mtd, chip,
                                  bufpoi, page);
            else if (!aligned && NAND_SUBPAGE_READ(chip) && !oob)
                ret = chip->ecc.read_subpage(mtd, chip,
                            col, bytes, bufpoi);
            else
                ret = chip->ecc.read_page(mtd, chip, bufpoi, page);    //-->
            if (ret < 0)
                break;
    ... ...
}

我X，又见回调！不过在之前的nand_scan_tail中已挂上了nand_read_page_hwecc。

--drivers/mtd/nand/nand_base.c --

/*
 * Read one page through the chip's hardware-ECC callbacks.
 * @mtd:  MTD device structure (ECC statistics are updated in mtd->ecc_stats)
 * @chip: NAND chip descriptor providing the ecc.* and read_buf callbacks
 * @buf:  destination for the page data
 * @page: not referenced anywhere in this body
 *
 * Pass 1: for each of chip->ecc.steps steps, call ecc.hwctl(NAND_ECC_READ),
 * read ecc.size bytes into buf via chip->read_buf, and let ecc.calculate
 * store that step's ECC into ecc_calc.
 * Then mtd->oobsize bytes are read into chip->oob_poi and the ecc.total
 * stored ECC bytes are gathered from it using ecc.layout->eccpos.
 * Pass 2: ecc.correct is run per step; a negative result bumps
 * ecc_stats.failed, any other result is added to ecc_stats.corrected.
 *
 * Always returns 0.
 */
static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
                uint8_t *buf, int page)
{
    int i, eccsize = chip->ecc.size;
    int eccbytes   = chip->ecc.bytes;
    int eccsteps   = chip->ecc.steps;
    uint8_t  *p = buf;
    uint8_t  *ecc_calc  = chip->buffers->ecccalc;
    uint8_t  *ecc_code  = chip->buffers->ecccode;
    uint32_t *eccpos    = chip->ecc.layout->eccpos;

/* chip->read_buf finally leads into the specific NAND driver's own xxx_read routine */
    for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
        chip->ecc.hwctl(mtd, NAND_ECC_READ);
        chip->read_buf(mtd, p, eccsize);
        chip->ecc.calculate(mtd, p, &ecc_calc[i]);
    }
    /* The OOB area follows the data; it carries the stored ECC bytes */
    chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);

    /* Collect the stored ECC bytes from their positions in the OOB buffer */
    for (i = 0; i < chip->ecc.total; i++)
        ecc_code[i] = chip->oob_poi[eccpos[i]];

    /* Rewind the step counter and data pointer for the correction pass */
    eccsteps = chip->ecc.steps;
    p = buf;

    for (i = 0 ; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
        int stat;

        stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]);
        if (stat < 0)
            mtd->ecc_stats.failed++;
        else
            mtd->ecc_stats.corrected += stat;
    }
    return 0;
}

--硬件驱动层--

这里以davinci_nand.c为例，TI的一款构架。

部分代码：

^^^^^^^^^^^
info->dev   = &pdev->dev;
info->base  = base;
info->vaddr = vaddr;
^^^^^^^^^^^
info->mtd.priv = &info->chip;
info->mtd.name = dev_name(&pdev->dev);
info->mtd.owner = THIS_MODULE;
info->mtd.dev.parent = &pdev->dev;
^^^^^^^^^^^
info->chip.IO_ADDR_R   = vaddr;
info->chip.IO_ADDR_W   = vaddr;
info->chip.chip_delay  = 0;
info->chip.select_chip = nand_davinci_select_chip;
/* options such as NAND_USE_FLASH_BBT or 16-bit widths */
info->chip.options = pdata->options;
info->chip.bbt_td  = pdata->bbt_td;
info->chip.bbt_md  = pdata->bbt_md;
/* Set address of hardware control function */
info->chip.cmd_ctrl  = nand_davinci_hwcontrol;
info->chip.dev_ready = nand_davinci_dev_ready;

/* Speed up buffer I/O */
info->chip.read_buf  = nand_davinci_read_buf;    //读 -->
info->chip.write_buf = nand_davinci_write_buf;
^^^^^^^^^^^
info->ioaddr       = (uint32_t __force) vaddr;
info->current_cs   = info->ioaddr;
info->core_chipsel = pdev->id;
info->mask_chipsel = pdata->mask_chipsel;
/* use nandboot-capable ALE/CLE masks by default */
info->mask_ale = pdata->mask_ale ? : MASK_ALE;
info->mask_cle = pdata->mask_cle ? : MASK_CLE;

这部分的思路很清晰，就是填充info指向的代表nand设备的结构体，其中包含了对函数指针chip.read_buf 的赋值。

其中包含了两个结构体：mtd_info 和 nand_chip。

驱动代码中出现了nand_scan_ident和nand_scan_tail，也就是mtd层挂钩子的过程。

/* Scan to find existence of the device(s) */
ret = nand_scan_ident(&info->mtd, pdata->mask_chipsel ? 2 : 1);
if (ret < 0) {
    dev_dbg(&pdev->dev, "no NAND chip(s) found\n");
    goto err_scan;
}

... ...

ret = nand_scan_tail(&info->mtd);
if (ret < 0)
    goto err_scan;

驱动的结尾调用add_mtd_device函数向内核注册mtd设备。

-->

nand_davinci_read_buf是要我们自己去实现，参考使用nand的时序图，比如：

具体的说就是gpio的控制。

小小的看下该回调的函数：

/*
 * chip->read_buf hook of the DaVinci NAND driver: copies len bytes from
 * chip->IO_ADDR_R into buf using the widest access that both the buffer
 * address and the length are aligned to - 32-bit when both are multiples
 * of 4, 16-bit when both are multiples of 2, otherwise byte by byte.
 */
static void nand_davinci_read_buf(struct mtd_info *mtd, uint8_t *buf, int len)
{
        struct nand_chip *chip = mtd->priv;
        /* A low bit set in either the address or the length breaks alignment */
        const unsigned misalign = ((unsigned)buf) | ((unsigned)len);

        if (!(misalign & 0x03)) {
                ioread32_rep(chip->IO_ADDR_R, buf, len >> 2);
                return;
        }

        if (!(misalign & 0x01)) {
                ioread16_rep(chip->IO_ADDR_R, buf, len >> 1);
                return;
        }

        ioread8_rep(chip->IO_ADDR_R, buf, len);
}

从寄存器读出相应位宽的数据，最后调到io口的“原子”操作__raw_reads{b,w,l}（写方向对应__raw_writes{b,w,l}）。

#define ioread32_rep(p,d,c)     __raw_readsl(p,d,c)

/*
 * Generic IO read/write.  These perform native-endian accesses.  Note
 * that some architectures will want to re-define __raw_{read,write}w.
 */
extern void __raw_readsb(const void __iomem *addr, void *data, int bytelen);
extern void __raw_readsw(const void __iomem *addr, void *data, int wordlen);
extern void __raw_readsl(const void __iomem *addr, void *data, int longlen);

__raw_readsl这类基础的“原子”函数，用汇编实现是必须的。下面以同族的__raw_readsb（arch/arm/lib/io-readsb.S）为例：

  1 /*
  2  *  linux/arch/arm/lib/io-readsb.S
  3  *
  4  *  Copyright (C) 1995-2000 Russell King
  5  *
  6  * This program is free software; you can redistribute it and/or modify
  7  * it under the terms of the GNU General Public License version 2 as
  8  * published by the Free Software Foundation.
  9  */
 10 #include <linux/linkage.h>
 11 #include <asm/assembler.h>
 12
 13 .Linsb_align:   rsb     ip, ip, #4
 14                 cmp     ip, r2
 15                 movgt   ip, r2
 16                 cmp     ip, #2
 17                 ldrb    r3, [r0]
 18                 strb    r3, [r1], #1
 19                 ldrgeb  r3, [r0]
 20                 strgeb  r3, [r1], #1
 21                 ldrgtb  r3, [r0]
 22                 strgtb  r3, [r1], #1
 23                 subs    r2, r2, ip
 24                 bne     .Linsb_aligned
 25
 26 ENTRY(__raw_readsb)
 27                 teq     r2, #0          @ do we have to check for the zero len?
 28                 moveq   pc, lr
 29                 ands    ip, r1, #3
 30                 bne     .Linsb_align
 31
 32 .Linsb_aligned: stmfd   sp!, {r4 - r6, lr}
 33
 34                 subs    r2, r2, #16
 35                 bmi     .Linsb_no_16
 36
 37 .Linsb_16_lp:   ldrb    r3, [r0]
 38                 ldrb    r4, [r0]
 39                 ldrb    r5, [r0]
 40                 mov     r3, r3,     put_byte_0
 41                 ldrb    r6, [r0]
 42                 orr     r3, r3, r4, put_byte_1
 43                 ldrb    r4, [r0]
 44                 orr     r3, r3, r5, put_byte_2
 45                 ldrb    r5, [r0]
 46                 orr     r3, r3, r6, put_byte_3
 47                 ldrb    r6, [r0]
 48                 mov     r4, r4,     put_byte_0
 49                 ldrb    ip, [r0]
 50                 orr     r4, r4, r5, put_byte_1
 51                 ldrb    r5, [r0]
 52                 orr     r4, r4, r6, put_byte_2
 53                 ldrb    r6, [r0]
 54                 orr     r4, r4, ip, put_byte_3
 55                 ldrb    ip, [r0]
 56                 mov     r5, r5,     put_byte_0
 57                 ldrb    lr, [r0]
 58                 orr     r5, r5, r6, put_byte_1
 59                 ldrb    r6, [r0]
 60                 orr     r5, r5, ip, put_byte_2
 61                 ldrb    ip, [r0]
 62                 orr     r5, r5, lr, put_byte_3
 63                 ldrb    lr, [r0]
 64                 mov     r6, r6,     put_byte_0
 65                 orr     r6, r6, ip, put_byte_1
 66                 ldrb    ip, [r0]
 67                 orr     r6, r6, lr, put_byte_2
 68                 orr     r6, r6, ip, put_byte_3
 69                 stmia   r1!, {r3 - r6}
 70
 71                 subs    r2, r2, #16
 72                 bpl     .Linsb_16_lp
 73
 74                 tst     r2, #15
 75                 ldmeqfd sp!, {r4 - r6, pc}
 76
 77 .Linsb_no_16:   tst     r2, #8
 78                 beq     .Linsb_no_8
 79
 80                 ldrb    r3, [r0]
 81                 ldrb    r4, [r0]
 82                 ldrb    r5, [r0]
 83                 mov     r3, r3,     put_byte_0
 84                 ldrb    r6, [r0]
 85                 orr     r3, r3, r4, put_byte_1
 86                 ldrb    r4, [r0]
 87                 orr     r3, r3, r5, put_byte_2
 88                 ldrb    r5, [r0]
 89                 orr     r3, r3, r6, put_byte_3
 90                 ldrb    r6, [r0]
 91                 mov     r4, r4,     put_byte_0
 92                 ldrb    ip, [r0]
 93                 orr     r4, r4, r5, put_byte_1
 94                 orr     r4, r4, r6, put_byte_2
 95                 orr     r4, r4, ip, put_byte_3
 96                 stmia   r1!, {r3, r4}
 97
 98 .Linsb_no_8:    tst     r2, #4
 99                 beq     .Linsb_no_4
100
101                 ldrb    r3, [r0]
102                 ldrb    r4, [r0]
103                 ldrb    r5, [r0]
104                 ldrb    r6, [r0]
105                 mov     r3, r3,     put_byte_0
106                 orr     r3, r3, r4, put_byte_1
107                 orr     r3, r3, r5, put_byte_2
108                 orr     r3, r3, r6, put_byte_3
109                 str     r3, [r1], #4
110
111 .Linsb_no_4:    ands    r2, r2, #3
112                 ldmeqfd sp!, {r4 - r6, pc}
113
114                 cmp     r2, #2
115                 ldrb    r3, [r0]
116                 strb    r3, [r1], #1
117                 ldrgeb  r3, [r0]
118                 strgeb  r3, [r1], #1
119                 ldrgtb  r3, [r0]
120                 strgtb  r3, [r1]
121
122                 ldmfd   sp!, {r4 - r6, pc}
123 ENDPROC(__raw_readsb)

OK，流水账完毕，粗略的浏览，过程中的每一部分都是一门学问，展开来去那就是一篇篇的论文。借一校友的社区签名：“好的论文就像一个美女，研读论文的过程就是脱衣服的过程。”