Sysprog 16

download Sysprog 16

If you can't read please download the document

Transcript of Sysprog 16

  • 1. C/C++ Linux System Programming
      • Session 16
    • User-space System Programming
    • session 6

2. Outline

  • Filesystem concepts
  • File I/O Ops

3. Filesystem

  • Traditionally: An abstraction for storage device access
  • Why?
    • Common sensible organization
    • Encapsulate OS HW interaction, e.g. performance considerations

4. VFS

  • Wider-range abstraction:
    • special FS, different types of disk FS, network FS
    • Common user interface
    • Multiple FS's
    • Common handling

5. Mounts

  • Superblock – filesystem control block
  • Mount point
  • Syscalls
    • int mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
    • int umount(const char *target);

6. FS Objects and Metadata

  • Inode – file control block
    • A unique ID
    • Access/Owner info
    • Memory maps
    • Block device info
  • Dirent – file as a directory entry (not physical)
  • File – file data and hook to metadata (not physical)

7. Journaling

  • Problem:
    • operations on metadata are non-atomic, can be interrupted by power loss
  • Physical vs logical journals
  • Metadata-only journals

8. Disk Cache

  • Buffers
  • Page cache
  • Writeback – pdflush
  • Read-ahead

9. File Descriptors

  • Descriptors index into process file table
  • int open(const char *pathname, int flags);
  • int open(const char *pathname, int flags, mode_t mode);
  • int creat(const char *pathname, mode_t mode);
    • Open with O_CREAT (disk files only)
  • int close(int fd); /* notice status !! */

10. File I/O modes

  • int fcntl(int fd, int cmd, long arg); // F_SETFL
  • Nonblocking: If not ready, EAGAIN - O_NONBLOCK
  • Synchronized: Wait until data is on HW - O_SYNC
    • int fsync(int fd);
  • Asynchronous: Signal when ready - O_ASYNC
    • SIGIO handler
    • fcntl: F_GETSIG / F_SETSIG, F_SETOWN/F_GETOWN (process getting signal)
  • Direct: Directly from user buffer - O_DIRECT

11. More File control

  • int unlink(const char *pathname);
  • int truncate(const char *path, off_t length);
    • int ftruncate(int fd, off_t length);
    • O_TRUNC on open

12. Descriptor I/O

  • ssize_t read(int fd, void *buf, size_t count);
  • ssize_t write(int fd, const void *buf, size_t count);
  • off_t lseek(int fd, off_t offset, int whence);
    • SEEK_SET, SEEK_CUR, SEEK_END
  • EOF

13. IO Vectors

  • ssize_t readv(int fd, const struct iovec *iov, int iovcnt);
  • ssize_t writev(int fd, const struct iovec *iov, int iovcnt);

struct iovec { void*iov_base;/* Starting address */ size_t iov_len;/* Number of bytes to transfer */ }; 14. int echo_main(int argc, char **argv) { struct iovec io[argc]; struct iovec *cur_io = io; char *arg; char *p; ... while (1) { int c; cur_io->iov_base = p = arg; ... while ((c = *arg++)) { if (c == eflag) { /* Check for escape seq. */ if (*arg == 'c') { /* 'c' means cancel newline and ignore all subsequent chars. */ cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; goto ret; } ... c = bb_process_escape_sequence( (void*) &arg); } *p++ = c; } arg = *++argv; if (arg) *p++ = ' '; cur_io->iov_len = p - (char*)cur_io->iov_base; cur_io++; if (!arg) break; } ret: return writev(1, io, (cur_io - io)) >= 0; } 15. Memory Mapped file

  • void *mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
  • int munmap(void *start, size_t length);
  • Important flags:
    • No anonymous, MAP_SHARED, MAP_FIXED, MAP_POPULATE ( | MAP_NONBLOCK)
  • int msync(void *start, size_t length, int flags); // MS_SYNC or MS_ASYNC
  • void *mremap(void *old_address, size_t old_size, size_t new_size, int flags);

16. Locking

  • Mandatory Locking (BSD)
    • ~S_IXGRP | SGID ( + mount flag MS_MANDLOCK)
    • Racy (mmap)
  • Advisory Locking
    • Both sides play nice

17. Advisory Locking

  • int flock(int fd, int operation); // LOCK_SH, LOCK_EX, LOCK_UN
  • int lockf(int fd, int cmd, off_t len); // F_LOCK, F_TLOCK, F_ULOCK, F_TEST
  • fcntl: F_GETLK, F_SETLK, F_SETLKW
    • High level of control (with offset, down to a single byte)

struct flock { ... short l_type;/* Type of lock: F_RDLCK, F_WRLCK, F_UNLCK */ short l_whence;/* How to interpret l_start: SEEK_SET, SEEK_CUR, SEEK_END */ off_t l_start;/* Starting offset for lock */ off_t l_len;/* Number of bytes to lock */ pid_t l_pid;/* PID of process blocking our lock (F_GETLK only) */ ... }; 18. #ifdef F_SETLK #ifndef SEEK_SET #define SEEK_SET 0 #endif struct flock lock_data; lock_data.l_type = F_WRLCK; lock_data.l_whence = SEEK_SET; lock_data.l_start = lock_data.l_len = 0; if (fcntl(pidFd, F_SETLK, &lock_data) == -1) { if (errno == EAGAIN) return oldpid; else return -1; } #else #ifdef LOCK_EX if (flock (pidFd, LOCK_EX|LOCK_NB) == -1) { if (errno == EWOULDBLOCK) return oldpid; else return -1; } #else if (lockf (pidFd, F_TLOCK, 0) == -1) { if (errno == EACCES) return oldpid; else return -1; } #endif #endif } 19. Buffered I/O

  • Streams: buffer I/O and write to the kernel at once
    • Better alignment
    • Less system calls
    • Yet another cache!!
    • FILE *
    • Formatting
  • FILE *fopen(const char *path, const char *mode);
  • FILE *fdopen(int fd, const char *mode);
  • int fclose(FILE *fp);
  • int fileno(FILE *stream);

20. I/O

  • size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream);
  • size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
  • Formatted
    • int fprintf(FILE *stream, const char *format, ...);
    • int fscanf(FILE *stream, const char *format, ...);
  • Char
    • int fputc(int c, FILE *stream);
    • int fgetc(FILE *stream);-- int ungetc(int c, FILE *stream);
  • String
    • int fputs(const char *s, FILE *stream);
    • char *fgets(char *s, int size, FILE *stream);

21. Behind the Scenes

  • Inherently thread-safe
  • To do your own locking (of the stream, not the file)
    • void flockfile(FILE *filehandle);
    • int ftrylockfile(FILE *filehandle);
    • void funlockfile(FILE *filehandle);
    • xxx_unlocked versions (e.g. fread_unlocked)
  • Flushing the stream (not the page cache)
    • int fflush(FILE *stream);

22. Errors

  • int feof(FILE *stream);
  • int ferror(FILE *stream);
  • void clearerr(FILE *stream);
  • Stream ops' return values alone cannot distinguish EOF from error (unlike read(), which returns 0 vs -1) – use feof()/ferror()

23. Positioning

  • int fseek(FILE *stream, long offset, int whence);
  • long ftell(FILE *stream);
  • int fgetpos(FILE *stream, fpos_t *pos);
  • int fsetpos(FILE *stream, const fpos_t *pos);

24. Metadata

  • int fstat(int fd, struct stat *buf);
  • int stat(const char *path, struct stat *buf);
    • lstat : BSD only
    • Exec on all nodes in path

struct stat { dev_t st_dev; /* ID of device containing file */ ino_t st_ino; /* inode number */ mode_t st_mode; /* protection */ nlink_t st_nlink; /* number of hard links */ uid_t st_uid; /* user ID of owner */ gid_t st_gid; /* group ID of owner */ dev_t st_rdev; /* device ID (if special file) */ off_t st_size; /* total size, in bytes */ blksize_t st_blksize; /* blocksize for filesystem I/O */ blkcnt_t st_blocks; /* number of blocks allocated */ time_t st_atime; /* time of last access */ time_t st_mtime; /* time of last modification */ time_t st_ctime; /* time of last status change */ }; 25. Directory Streams

  • A directory is a file whose entries are other inodes
  • DIR *opendir(const char *name);
  • int closedir(DIR *dir);
  • struct dirent *readdir(DIR *dir);

struct dirent { ino_t d_ino; /* inode number */ off_t d_off; /* offset to the next dirent */ unsigned short d_reclen; /* length of this record */ unsigned char d_type; /* type of file */ char d_name[256]; /* filename */ }; 26. static pid_list *scan_proc_pids(inode_list *ilist) { DIR *d; struct dirent *de; pid_t pid; pid_list *plist; xchdir("/proc"); d = opendir("/proc"); if (!d) return NULL; plist = NULL; while ((de = readdir(d)) != NULL) { pid = (pid_t)bb_strtou(de->d_name, NULL, 10); if (errno) continue; if (chdir(de->d_name) < 0) continue; plist = scan_link("cwd", pid, ilist, plist); plist = scan_link("exe", pid, ilist, plist); plist = scan_link("root", pid, ilist, plist); .... } closedir(d); return plist; } static pid_list *scan_link(const char *lname, pid_t pid, inode_list *ilist, pid_list *plist) { ino_t inode; dev_t dev; if (!file_to_dev_inode(lname, &dev, &inode)) return plist; if (search_dev_inode(ilist, dev, inode)) plist = add_pid(plist, pid); return plist; } static int file_to_dev_inode(const char *filename, dev_t *dev, ino_t *inode) { struct stat f_stat; if (stat(filename, &f_stat)) return 0; *inode = f_stat.st_ino; *dev = f_stat.st_dev; return 1; } static int search_dev_inode(inode_list *ilist, dev_t dev, ino_t inode) { while (ilist) { if (ilist->dev == dev) { if (option_mask32 & OPT_MOUNT) return 1; if (ilist->inode == inode) return 1; } ilist = ilist->next; } return 0; } 27. I/O Multiplexing

  • int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
  • int pselect(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timespec *timeout, const sigset_t *sigmask);
  • int poll(struct pollfd *fds, nfds_t nfds, int timeout);
  • int ppoll(struct pollfd *fds, nfds_t nfds, const struct timespec *timeout, const sigset_t *sigmask);
    • POLLIN/POLLOUT/POLLPRI/POLLERR

void FD_CLR(int fd, fd_set *set); int FD_ISSET(int fd, fd_set *set); void FD_SET(int fd, fd_set *set); void FD_ZERO(fd_set *set); struct pollfd { int fd; /* file descriptor */ short events; /* requested events */ short revents; /* returned events */ }; 28. Epoll

  • Decouple interest set registration from poll
    • +: O(1) on the wait
    • +: Edge trigger
    • - : system call for adding onto the set
  • int epoll_create(int size); //desc, need close
  • int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
  • int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);

typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ }; 29. 30. IOCTL

  • Device / special file control
  • int ioctl(int d, int request, ...);
  • Request is specific to device being controlled, and may have a payload (ioctl_list)

31. Filesystem events

  • int inotify_init(void); // desc, need close
  • int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc
  • int inotify_rm_watch(int fd, uint32_t wd);
  • FIONREAD ioctl
  • fcntl: F_NOTIFY

struct inotify_event { int wd; /* watch descriptor */ uint32_t mask; /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len; /* size of 'name' field */ char name[]; /* null-terminated name */ }; 32. int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names; if (i >= 0) { mask |= (1 << i); } } } ... [text lost in extraction: adding the watch and the head of the event loop] ... > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= ... [text lost in extraction: choosing eventbuf or a heap buffer, and reading the events into buf] ... while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; } 33. Asynchronous I/O

  • Only on O_DIRECT

struct aiocb { int aio_fildes; /* file descriptor */ int aio_lio_opcode; /* operation to perform */ int aio_reqprio; /* request priority offset */ volatile void *aio_buf; /* pointer to buffer */ size_t aio_nbytes; /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);