# Glibc & VFS & FS & Device
-----
# Object-Oriented Programming With ANSI-C
> Object-oriented programming (OOP) is a programming paradigm based on the concept of "objects", which can contain **data**, in the form of fields (often known as attributes), and code, in the form of **procedures** (often known as methods).
* Inheritance
/*
 * Filesystem-specific inode wrapper (proc, going by the name). The
 * generic VFS inode is embedded as a member rather than pointed to, so
 * code holding a pointer to vfs_inode can get back to the enclosing
 * proc_inode with container_of().
 */
struct proc_inode {
struct pid *pid;
// ...
/* Embedded generic inode: the "base class" part of this object. */
struct inode vfs_inode;
};
/*
 * container_of - recover a pointer to the enclosing structure from a
 * pointer to one of its members.
 * @ptr:    pointer to the member
 * @type:   type of the enclosing structure
 * @member: name of the member within @type
 *
 * The temporary __mptr makes the compiler check that @ptr really has the
 * member's type. __typeof__ is used instead of typeof so the macro also
 * builds when the compiler runs in a strict ISO mode (e.g. -std=c11);
 * the ({ ... }) statement expression itself is still a GNU extension.
 */
#define container_of(ptr, type, member) ({ \
	const __typeof__( ((type *)0)->member ) *__mptr = (ptr); \
	(type *)( (char *)__mptr - offsetof(type,member) );})
* Override
/*
 * Excerpt of the VFS inode, trimmed to the members this slide discusses.
 * The operation tables are reached through pointers, so a filesystem or
 * driver can "override" behaviour by pointing them at its own table.
 */
struct inode {
	/* NOTE(review): elsewhere in these slides f_op is accessed through a
	 * struct file pointer (filp->f_op) and struct inode carries i_op of
	 * type struct inode_operations - confirm whether this line is meant
	 * to illustrate struct file instead. */
	const struct file_operations *f_op;
	const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
	// ...
	void *i_private; /* fs or device private pointer */
};
# Glibc
> The GNU C Library, commonly known as **glibc**, is the GNU Project's implementation of the **C standard library**. Despite its name, it now also directly supports C++ (and, indirectly, other programming languages). It was started in the early 1990s by the **Free Software Foundation** (FSF) for their GNU operating system.
# A piece of Code
#include <stdio.h>
#include <stdlib.h>
/*
 * Print the contents of ./t.c to standard output, one character at a time.
 *
 * Returns 0 on success and 1 if the file cannot be opened.
 */
int main(void) {
	/* fgetc() returns an int so that EOF can be told apart from every
	 * valid byte; storing it in a plain char either ends the loop early
	 * on a 0xFF byte or never terminates where char is unsigned. */
	int ch;
	FILE *fp = fopen("./t.c", "r");

	/* fopen() returns NULL on failure; fgetc(NULL) would be undefined. */
	if (fp == NULL) {
		perror("./t.c");
		return 1;
	}
	while ((ch = fgetc(fp)) != EOF)
		printf("%c", ch);
	fclose(fp);
	return 0;
}
-----
# Dive Into Glibc
FILE * _IO_new_fopen (const char *filename, const char *mode)
FILE * __fopen_internal (const char *filename, const char *mode, int is32)
versioned_symbol (libc, _IO_new_file_fopen, _IO_file_fopen, GLIBC_2_1);
_IO_new_file_fopen (FILE *fp, const char *filename, const char *mode, int is32not64)
FILE * _IO_file_open (FILE *fp, const char *filename, int posix_mode, int prot, int read_write, int is32not64)
extern int __open (const char *__file, int __oflag, ...);
strong_alias (__libc_open64, __open)
-----
# Dive Into Glibc
/* Open FILE with access OFLAG. If O_CREAT or O_TMPFILE is in OFLAG,
a third argument is the file protection. */
int
__libc_open64 (const char *file, int oflag, ...)
{
/* Stays 0 unless the flags indicate that a mode argument was passed. */
int mode = 0;
/* The third argument is optional, so it is only fetched from the
   variadic list when __OPEN_NEEDS_MODE (oflag) is true. */
if (__OPEN_NEEDS_MODE (oflag))
{
va_list arg;
va_start (arg, oflag);
mode = va_arg (arg, int);
va_end (arg);
}
/* The open is issued as an openat system call relative to AT_FDCWD,
   with EXTRA_OPEN_FLAGS OR-ed into the caller's flags. */
return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag | EXTRA_OPEN_FLAGS,
mode);
}
-----
# A Thousand Years Later
/*
 * internal_syscall0 - issue a system call that takes no arguments.
 * @number: system call number; the "0" constraint ties it to output
 *          operand 0, which uses the "=a" register constraint
 *          (presumably rax on x86-64 - confirm against the target).
 * @err and the variadic @dummy are accepted but not referenced in the
 * expansion.
 *
 * Executes the `syscall` instruction and evaluates to the value left in
 * that register, converted to long int. "memory" and
 * REGISTERS_CLOBBERED_BY_SYSCALL are declared as clobbers.
 */
#undef internal_syscall0
#define internal_syscall0(number, err, dummy...) \
({ \
unsigned long int resultvar; \
asm volatile ( \
"syscall\n\t" \
: "=a" (resultvar) \
: "0" (number) \
: "memory", REGISTERS_CLOBBERED_BY_SYSCALL); \
(long int) resultvar; \
})
# Tour in Kernel : VFS
# Kernel: User Interface
* fs/open.c
/*
 * openat system call entry point, declared through SYSCALL_DEFINE4 with
 * four typed arguments: dfd, filename, flags and mode.
 * ORs O_LARGEFILE into flags when force_o_largefile() is true, then hands
 * everything to do_sys_open() and returns its result.
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(dfd, filename, flags, mode);
}
* include/linux/syscalls.h
/* Zero-argument form: emits SYSCALL_METADATA for _<sname> followed by the
   declarator of sys_<sname>(void); the function body is written after the
   macro invocation. */
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long sys_##sname(void)
/* N-argument forms: forward to SYSCALL_DEFINEx with the argument count
   and the name prefixed by an underscore. */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
// ....
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
-----
# Kernel: Current's fd
* fs/open.c
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
|
|---> static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
|---> struct filename * getname_flags(const char __user *filename, int flags, int *empty)
|---> get_unused_fd_flags(flags);
|---> struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op)
|---> static inline void fsnotify_open(struct file *file)
|---> void fd_install(unsigned int fd, struct file *file)
* fs/file.c
/* Allocate a file descriptor in the current task's file table: calls
   __alloc_fd() on current->files with start 0 and end
   rlimit(RLIMIT_NOFILE), and returns its result. */
int get_unused_fd_flags(unsigned flags) {
return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}
/* Same as get_unused_fd_flags(), except that the caller chooses the
   start value passed to __alloc_fd() instead of it being 0. */
static int alloc_fd(unsigned start, unsigned flags) {
return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}
/*
* allocate a file descriptor, mark it busy.
*/
int __alloc_fd(struct files_struct *files,
unsigned start, unsigned end, unsigned flags) {
-----
# Kernel : VFS
* fs/namei.c
/*
 * Open the file named by @pathname, resolved relative to @dfd, using the
 * lookup flags carried in @op.
 *
 * Returns whatever the last path_openat() attempt returned: a struct file
 * pointer, or an ERR_PTR()-encoded error.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op)
{
struct nameidata nd;
int flags = op->lookup_flags;
struct file *filp;
set_nameidata(&nd, dfd, pathname);
/* First attempt: walk with LOOKUP_RCU set. */
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
/* -ECHILD after the LOOKUP_RCU attempt: retry with the plain flags. */
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags);
/* -ESTALE: retry once more with LOOKUP_REVAL added. */
if (unlikely(filp == ERR_PTR(-ESTALE)))
filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
/* Presumably undoes the set_nameidata() call above. */
restore_nameidata();
return filp;
}
* fs/namei.c
struct nameidata {
struct path path;
struct qstr last;
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags;
// ....
-----
# Kernel : VFS
* fs/namei.c
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
// ....
while (!(error = link_path_walk(s, nd)) &&
(error = do_last(nd, file, op, &opened)) > 0) {
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
s = trailing_symlink(nd);
if (IS_ERR(s)) {
error = PTR_ERR(s);
break;
}
}
// ....
-----
# Kernel : VFS
* fs/namei.c
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
*
* Returns 0 and nd will have valid dentry and mnt on success.
* Returns error and drops reference to input namei data on failure.
*/
static int link_path_walk(const char *name, struct nameidata *nd)
/*
* Handle the last step of open()
*/
static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op,
int *opened)
-----
# Kernel : VFS
* fs/namei.c
/* Fast lookup failed, do it the slow way */
/*
 * Looks up nd->last in the directory nd->path.dentry while holding that
 * directory inode's i_mutex, stores the result in @path and finishes with
 * follow_managed().
 *
 * Returns PTR_ERR() of the lookup result if it is an error pointer,
 * otherwise the return value of follow_managed().
 */
static int lookup_slow(struct nameidata *nd, struct path *path)
{
struct dentry *dentry, *parent;
parent = nd->path.dentry;
/* nd->inode is expected to be the inode of nd->path.dentry. */
BUG_ON(nd->inode != parent->d_inode);
/* The parent directory's i_mutex is held across the lookup. */
mutex_lock(&parent->d_inode->i_mutex);
dentry = __lookup_hash(&nd->last, parent, nd->flags);
mutex_unlock(&parent->d_inode->i_mutex);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
/* The result keeps the mount of the parent path. */
path->mnt = nd->path.mnt;
path->dentry = dentry;
return follow_managed(path, nd);
}
/*
 * Look @name up under @base: try lookup_dcache() first and only call
 * lookup_real() on the directory inode when lookup_dcache() reports
 * through need_lookup that a real lookup is still required.
 */
static struct dentry *__lookup_hash(struct qstr *name,
struct dentry *base, unsigned int flags)
{
bool need_lookup;
struct dentry *dentry;
dentry = lookup_dcache(name, base, flags, &need_lookup);
/* The result of lookup_dcache() is returned as-is in this case. */
if (!need_lookup)
return dentry;
return lookup_real(base->d_inode, dentry, flags);
}
-----
# Kernel : VFS
* fs/namei.c
/*
 * Ask the directory's own ->lookup() method (dir->i_op->lookup) to look
 * @dentry up. This indirect call is where the generic VFS code dispatches
 * into the concrete filesystem.
 *
 * Returns @dentry, or the dentry handed back by ->lookup() when that is
 * non-NULL, or ERR_PTR(-ENOENT) if IS_DEADDIR(dir) is true.
 */
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct dentry *old;
/* Don't create child dentry for a dead directory. */
if (unlikely(IS_DEADDIR(dir))) {
dput(dentry);
return ERR_PTR(-ENOENT);
}
old = dir->i_op->lookup(dir, dentry, flags);
/* A non-NULL result replaces the dentry that was passed in, which is
   released with dput(). */
if (unlikely(old)) {
dput(dentry);
dentry = old;
}
return dentry;
}
# Tour in Kernel : Ext4
# Prerequisite Knowledge
## Inode
/*
 * Excerpt of the VFS inode showing the members that matter for the ext4
 * and device walkthroughs: the two operation tables and the union that
 * holds the type-specific pointer.
 */
struct inode {
	//...
	/* Inode-level methods; lookup_real() calls i_op->lookup. */
	const struct inode_operations *i_op;
	//...
	const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
	//...
	/* Anonymous union: the members share storage, so only one of them is
	 * meaningful for a given inode. */
	union {
		struct pipe_inode_info *i_pipe;
		struct block_device *i_bdev;
		struct cdev *i_cdev;
		char *i_link;
	};
	//...
};
-----
# Prerequisite Knowledge
## file operation
/*
 * Table of function pointers implementing the operations on an open file;
 * most methods take a struct file *. Filesystems and drivers provide
 * their own instance and fill in the methods they support with
 * designated initializers.
 */
struct file_operations {
struct module *owner;
/* Positioning and data transfer. */
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
/* open and release are the two methods that also receive the inode. */
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
};
-----
# Prerequisite Knowledge
## Inode operation
/*
 * Table of function pointers implementing operations on an inode
 * (name lookup, links, directory and node creation/removal, rename).
 * NOTE(review): this looks like an abridged listing - the ext4 tables in
 * these slides also initialise members such as .setattr and .tmpfile that
 * are not shown here.
 */
struct inode_operations {
/* Called by lookup_real() as dir->i_op->lookup(dir, dentry, flags). */
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
const char * (*follow_link) (struct dentry *, void **);
int (*permission) (struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
void (*put_link) (struct inode *, void *);
int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
int (*mkdir) (struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
/* Same as rename plus a trailing unsigned int argument. */
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
} ____cacheline_aligned;
-----
# Ext4
/*
 * inode_operations table for ext4 directories (going by the name): each
 * VFS method is bound to its ext4 implementation, except the set/get/
 * remove xattr entries, which use generic_* helpers. .lookup = ext4_lookup
 * is the entry reached through dir->i_op->lookup in lookup_real().
 */
const struct inode_operations ext4_dir_inode_operations = {
.create = ext4_create,
.lookup = ext4_lookup,
.link = ext4_link,
.unlink = ext4_unlink,
.symlink = ext4_symlink,
.mkdir = ext4_mkdir,
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
.rename2 = ext4_rename2,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
/*
 * inode_operations table for ext4 special inodes (going by the name).
 * Only attribute, xattr and ACL methods are set; no lookup/create/link
 * style methods are initialised in this table.
 */
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
};
-----
# Ext4
## Where the **operation**s are assigned
2. ext4 inode on the disk
struct ext4_inode {
__le16 i_mode; /* File mode */
__le16 i_uid; /* Low 16 bits of Owner Uid */
__le32 i_size_lo; /* Size in bytes */
__le32 i_atime; /* Access time */
__le32 i_ctime; /* Inode Change time */
__le32 i_mtime; /* Modification time */
__le32 i_dtime; /* Deletion Time */
__le16 i_gid; /* Low 16 bits of Group Id */
//...
3. ext4 inode in memory
struct ext4_inode_info {
//...
struct inode vfs_inode;
//...
1. create dir or file
static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-----
# Ext4
* fs/ext4/namei.c
static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
static struct buffer_head * ext4_find_entry (struct inode *dir,
const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir,
int *inlined){
// ....
do {
block = dx_get_block(frame->at);
bh = ext4_read_dirblock(dir, block, DIRENT);
// ....
#define ext4_read_dirblock(inode, block, type) \
__ext4_read_dirblock((inode), (block), (type), __func__, __LINE__)
static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
ext4_lblk_t block,
dirblock_type_t type,
const char *func,
unsigned int line)
-----
# Ext4
* fs/ext4/inode.c
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int map_flags);
|---> struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int map_flags)
|----> static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block)
|----> struct buffer_head * __getblk_gfp(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
Then, stuff with **block allocation**, **preallocation** and **buffer**
-----
# Ext4: Mount
/*
 * Mount entry point for ext4: delegates entirely to mount_bdev(), passing
 * its own arguments through and supplying ext4_fill_super as the final
 * (callback) argument. Returns whatever mount_bdev() returns.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
}
1. What's **dev_name** ?
2. What's the job of **mount\_bdev** ?
3. What's the job of **ext4\_fill\_super** ?
-----
# Ext4: ext4\_fill\_super
1. layout of ext4 disk
+----------------+------------------+------------------+------------------------+
| | | | |
| Boot Block | Block | Block | ... |
| | Group 0 | Group 1 | |
+----------------+------------------+------------------+------------------------+
+--------+----------------+----------+-------------+------------+--------------------------+
| | | | | | |
| Super | Group | Data | Inode | Inode | Data Blocks |
| block | Descriptor | Bitmap | Bitmap | Table | |
| | | | | | |
+--------+----------------+----------+-------------+------------+--------------------------+
2. ext4 super block
struct ext4_super_block {
/*00*/ __le32 s_inodes_count; /* Inodes count */
__le32 s_blocks_count_lo; /* Blocks count */
__le32 s_r_blocks_count_lo; /* Reserved blocks count */
__le32 s_free_blocks_count_lo; /* Free blocks count */
/*10*/ __le32 s_free_inodes_count; /* Free inodes count */
3. generic super block
struct super_block {
struct list_head s_list; /* Keep this first */
unsigned char s_blocksize_bits;
unsigned long s_blocksize;
struct file_system_type *s_type;
const struct super_operations *s_op;
-----
# Ext4: mount\_bdev
* fs/super.c
struct block_device *bdev;
struct super_block *s;
//....
bdev = blkdev_get_by_path(dev_name, mode, fs_type);
s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
bdev);
//....
* fs/super.c
/*
 * Callback handed to sget() in the mount_bdev excerpt: binds super_block
 * @s to the block device passed in @data by filling in s_bdev, s_dev and
 * s_bdi. Always returns 0.
 */
static int set_bdev_super(struct super_block *s, void *data) {
/* @data is the value passed as the last argument of sget() (bdev). */
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
/*
 * We set the bdi here to the queue backing, file systems can
 * overwrite this in ->fill_super()
 */
s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
-----
# Ext4: mount\_bdev
* fs/block_dev.c
struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
| void *holder)
|---> bdev = lookup_bdev(path);
| |---> error = kern_path(pathname, LOOKUP_FOLLOW, &path);
| |
| |---> filename_lookup(AT_FDCWD, getname_kernel(name), flags, path, NULL);
| |
| |----> retval = path_lookupat(&nd, flags | LOOKUP_RCU, path);
|
|---> err = blkdev_get(bdev, mode, holder);
**path_lookupat** and **path_openat**
**Everything is a file.**
-----
# Ext4: What is dev\_name
1. /etc/fstab
➜ linux-4.4.162 git:(master) ✗ cat /etc/fstab
#
UUID=10BC-2E85 /boot/efi vfat defaults,noatime 0 2
UUID=961b8b3f-926b-4ca3-aa9b-edf7946d6604 / ext4 defaults,noatime 0 1
2. df
➜ df -Th
Filesystem Type Size Used Avail Use% Mounted on
dev devtmpfs 7.8G 0 7.8G 0% /dev
run tmpfs 7.8G 1.5M 7.8G 1% /run
/dev/nvme0n1p5 ext4 115G 94G 15G 87% /
tmpfs tmpfs 7.8G 701M 7.1G 9% /dev/shm
tmpfs tmpfs 7.8G 0 7.8G 0% /sys/fs/cgroup
tmpfs tmpfs 7.8G 88M 7.7G 2% /tmp
/dev/nvme0n1p1 vfat 96M 50M 47M 52% /boot/efi
tmpfs tmpfs 1.6G 40K 1.6G 1% /run/user/1000
/dev/nvme0n1p3 fuseblk 121G 109G 13G 91% /run/media/shen/BECCDE17CCDDC9B3
# Tour in Kernel : Device
# Device
> use char device as example
/*
 * Character device object used as the example in this section. It embeds
 * a kobject and a list_head and carries its own file_operations pointer
 * in @ops, separate from the i_fop pointer held by the inode.
 */
struct cdev {
struct kobject kobj;
struct module *owner;
/* Operation table of the device itself. */
const struct file_operations *ops;
struct list_head list;
/* NOTE(review): presumably the first device number and how many
   consecutive numbers this cdev covers - confirm. */
dev_t dev;
unsigned int count;
};
> Two **const struct file_operations *ops**
-----
# Device: the first fops
1. standard char fops
/*
 * Default file_operations that init_special_inode() installs in i_fop for
 * character-device inodes. Only open (chrdev_open) and llseek are set.
 */
const struct file_operations def_chr_fops = {
.open = chrdev_open,
.llseek = noop_llseek,
};
2. init the fops with inode
/*
 * Set up @inode for a special file: store @mode in i_mode and select the
 * default file_operations by file type.
 *   - character device: i_fop = &def_chr_fops, i_rdev = @rdev
 *   - block device:     i_fop = &def_blk_fops, i_rdev = @rdev
 *   - FIFO:             i_fop = &pipefifo_fops
 *   - socket:           i_fop is left untouched
 *   - anything else:    a KERN_DEBUG message is printed
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
if (S_ISCHR(mode)) {
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
inode->i_fop = &pipefifo_fops;
else if (S_ISSOCK(mode))
; /* leave it no_open_fops */
else
/* Unrecognised file type: only logged, the inode is not changed further. */
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);
-----
# Device: the second fops
1. memory fops
/*
 * file_operations whose open method is memory_open(), which swaps in the
 * per-minor operation table. Only open and llseek are set here.
 */
static const struct file_operations memory_fops = {
.open = memory_open,
.llseek = noop_llseek,
};
2. init the fops with inode
/*
 * open() method of memory_fops. Uses the inode's minor number as an index
 * into devlist, replaces filp->f_op with that entry's fops and, if the
 * new table has an open method, calls it.
 *
 * Returns -ENXIO if the minor is out of range or the entry has no fops,
 * otherwise the result of the entry's open method, or 0 if it has none.
 */
static int memory_open(struct inode *inode, struct file *filp)
{
int minor;
const struct memdev *dev;
minor = iminor(inode);
/* Bounds check before indexing devlist. */
if (minor >= ARRAY_SIZE(devlist))
return -ENXIO;
dev = &devlist[minor];
if (!dev->fops)
return -ENXIO;
/* From here on the file uses the device-specific operation table. */
filp->f_op = dev->fops;
filp->f_mode |= dev->fmode;
if (dev->fops->open)
return dev->fops->open(inode, filp);
return 0;
}
-----
# Device : /dev/null
1. **/dev/null** fops
/*
 * file_operations for /dev/null: seek, read, write, their iterator
 * variants and splice_write are each bound to a *_null implementation.
 */
static const struct file_operations null_fops = {
.llseek = null_lseek,
.read = read_null,
.write = write_null,
.read_iter = read_iter_null,
.write_iter = write_iter_null,
.splice_write = splice_write_null,
};
2. write\_null does nothing
/*
 * write method of null_fops. Ignores @file, @buf and @ppos and returns
 * @count, i.e. it reports every byte as written.
 */
static ssize_t write_null(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
return count;
}
-----
# End : Tools
## [ccls](https://github.com/MaskRay/ccls)
+------------------------+ +---------------------------+ +-----------------------+
| | | | | |
| Atom | | coc.nvim | | |
| Emacs +--> | LanguageClient-neovim +--> | ccls |
| Vim/Neovim | | vim-lsp | | |
| Visual Studio Code | | | | |
| Monaco Editor | | | | |
+------------------------+ +---------------------------+ +-----------------------+
| | | | | |
| Editor | <--+ Editor Plugin | <--+ Language Server |
| | | | | |
| | | | | |
+------------------------+ +---------------------------+ +-----------------------+