Back to home page

LXR

 
 

    


0001 /*
0002  *  linux/fs/open.c
0003  *
0004  *  Copyright (C) 1991, 1992  Linus Torvalds
0005  */
0006 
0007 #include <linux/string.h>
0008 #include <linux/mm.h>
0009 #include <linux/file.h>
0010 #include <linux/fdtable.h>
0011 #include <linux/fsnotify.h>
0012 #include <linux/module.h>
0013 #include <linux/tty.h>
0014 #include <linux/namei.h>
0015 #include <linux/backing-dev.h>
0016 #include <linux/capability.h>
0017 #include <linux/securebits.h>
0018 #include <linux/security.h>
0019 #include <linux/mount.h>
0020 #include <linux/fcntl.h>
0021 #include <linux/slab.h>
0022 #include <linux/uaccess.h>
0023 #include <linux/fs.h>
0024 #include <linux/personality.h>
0025 #include <linux/pagemap.h>
0026 #include <linux/syscalls.h>
0027 #include <linux/rcupdate.h>
0028 #include <linux/audit.h>
0029 #include <linux/falloc.h>
0030 #include <linux/fs_struct.h>
0031 #include <linux/ima.h>
0032 #include <linux/dnotify.h>
0033 #include <linux/compat.h>
0034 
0035 #include "internal.h"
0036 
0037 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
0038     struct file *filp)
0039 {
0040     int ret;
0041     struct iattr newattrs;
0042 
0043     /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
0044     if (length < 0)
0045         return -EINVAL;
0046 
0047     newattrs.ia_size = length;
0048     newattrs.ia_valid = ATTR_SIZE | time_attrs;
0049     if (filp) {
0050         newattrs.ia_file = filp;
0051         newattrs.ia_valid |= ATTR_FILE;
0052     }
0053 
0054     /* Remove suid, sgid, and file capabilities on truncate too */
0055     ret = dentry_needs_remove_privs(dentry);
0056     if (ret < 0)
0057         return ret;
0058     if (ret)
0059         newattrs.ia_valid |= ret | ATTR_FORCE;
0060 
0061     inode_lock(dentry->d_inode);
0062     /* Note any delegations or leases have already been broken: */
0063     ret = notify_change(dentry, &newattrs, NULL);
0064     inode_unlock(dentry->d_inode);
0065     return ret;
0066 }
0067 
0068 long vfs_truncate(const struct path *path, loff_t length)
0069 {
0070     struct inode *inode;
0071     struct dentry *upperdentry;
0072     long error;
0073 
0074     inode = path->dentry->d_inode;
0075 
0076     /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
0077     if (S_ISDIR(inode->i_mode))
0078         return -EISDIR;
0079     if (!S_ISREG(inode->i_mode))
0080         return -EINVAL;
0081 
0082     error = mnt_want_write(path->mnt);
0083     if (error)
0084         goto out;
0085 
0086     error = inode_permission(inode, MAY_WRITE);
0087     if (error)
0088         goto mnt_drop_write_and_out;
0089 
0090     error = -EPERM;
0091     if (IS_APPEND(inode))
0092         goto mnt_drop_write_and_out;
0093 
0094     /*
0095      * If this is an overlayfs then do as if opening the file so we get
0096      * write access on the upper inode, not on the overlay inode.  For
0097      * non-overlay filesystems d_real() is an identity function.
0098      */
0099     upperdentry = d_real(path->dentry, NULL, O_WRONLY);
0100     error = PTR_ERR(upperdentry);
0101     if (IS_ERR(upperdentry))
0102         goto mnt_drop_write_and_out;
0103 
0104     error = get_write_access(upperdentry->d_inode);
0105     if (error)
0106         goto mnt_drop_write_and_out;
0107 
0108     /*
0109      * Make sure that there are no leases.  get_write_access() protects
0110      * against the truncate racing with a lease-granting setlease().
0111      */
0112     error = break_lease(inode, O_WRONLY);
0113     if (error)
0114         goto put_write_and_out;
0115 
0116     error = locks_verify_truncate(inode, NULL, length);
0117     if (!error)
0118         error = security_path_truncate(path);
0119     if (!error)
0120         error = do_truncate(path->dentry, length, 0, NULL);
0121 
0122 put_write_and_out:
0123     put_write_access(upperdentry->d_inode);
0124 mnt_drop_write_and_out:
0125     mnt_drop_write(path->mnt);
0126 out:
0127     return error;
0128 }
0129 EXPORT_SYMBOL_GPL(vfs_truncate);
0130 
0131 static long do_sys_truncate(const char __user *pathname, loff_t length)
0132 {
0133     unsigned int lookup_flags = LOOKUP_FOLLOW;
0134     struct path path;
0135     int error;
0136 
0137     if (length < 0) /* sorry, but loff_t says... */
0138         return -EINVAL;
0139 
0140 retry:
0141     error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
0142     if (!error) {
0143         error = vfs_truncate(&path, length);
0144         path_put(&path);
0145     }
0146     if (retry_estale(error, lookup_flags)) {
0147         lookup_flags |= LOOKUP_REVAL;
0148         goto retry;
0149     }
0150     return error;
0151 }
0152 
0153 SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
0154 {
0155     return do_sys_truncate(path, length);
0156 }
0157 
#ifdef CONFIG_COMPAT
/* Compat truncate(2): compat_off_t length is widened to loff_t. */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
    loff_t new_size = length;

    return do_sys_truncate(path, new_size);
}
#endif
0164 
0165 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
0166 {
0167     struct inode *inode;
0168     struct dentry *dentry;
0169     struct fd f;
0170     int error;
0171 
0172     error = -EINVAL;
0173     if (length < 0)
0174         goto out;
0175     error = -EBADF;
0176     f = fdget(fd);
0177     if (!f.file)
0178         goto out;
0179 
0180     /* explicitly opened as large or we are on 64-bit box */
0181     if (f.file->f_flags & O_LARGEFILE)
0182         small = 0;
0183 
0184     dentry = f.file->f_path.dentry;
0185     inode = dentry->d_inode;
0186     error = -EINVAL;
0187     if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
0188         goto out_putf;
0189 
0190     error = -EINVAL;
0191     /* Cannot ftruncate over 2^31 bytes without large file support */
0192     if (small && length > MAX_NON_LFS)
0193         goto out_putf;
0194 
0195     error = -EPERM;
0196     if (IS_APPEND(inode))
0197         goto out_putf;
0198 
0199     sb_start_write(inode->i_sb);
0200     error = locks_verify_truncate(inode, f.file, length);
0201     if (!error)
0202         error = security_path_truncate(&f.file->f_path);
0203     if (!error)
0204         error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
0205     sb_end_write(inode->i_sb);
0206 out_putf:
0207     fdput(f);
0208 out:
0209     return error;
0210 }
0211 
0212 SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
0213 {
0214     return do_sys_ftruncate(fd, length, 1);
0215 }
0216 
#ifdef CONFIG_COMPAT
/* Compat ftruncate(2): same as the native call, with a compat_ulong_t length. */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
{
    loff_t new_size = length;

    return do_sys_ftruncate(fd, new_size, 1);
}
#endif
0223 
/* The LFS variants of truncate are only built for 32-bit machines. */
#if BITS_PER_LONG == 32
/* truncate64(2): path-based truncate taking a full loff_t length. */
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
    long ret = do_sys_truncate(path, length);

    return ret;
}

/* ftruncate64(2): descriptor-based truncate without the non-LFS size limit. */
SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
    long ret = do_sys_ftruncate(fd, length, 0);

    return ret;
}
#endif /* BITS_PER_LONG == 32 */
0236 
0237 
0238 int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
0239 {
0240     struct inode *inode = file_inode(file);
0241     long ret;
0242 
0243     if (offset < 0 || len <= 0)
0244         return -EINVAL;
0245 
0246     /* Return error if mode is not supported */
0247     if (mode & ~FALLOC_FL_SUPPORTED_MASK)
0248         return -EOPNOTSUPP;
0249 
0250     /* Punch hole and zero range are mutually exclusive */
0251     if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
0252         (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
0253         return -EOPNOTSUPP;
0254 
0255     /* Punch hole must have keep size set */
0256     if ((mode & FALLOC_FL_PUNCH_HOLE) &&
0257         !(mode & FALLOC_FL_KEEP_SIZE))
0258         return -EOPNOTSUPP;
0259 
0260     /* Collapse range should only be used exclusively. */
0261     if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
0262         (mode & ~FALLOC_FL_COLLAPSE_RANGE))
0263         return -EINVAL;
0264 
0265     /* Insert range should only be used exclusively. */
0266     if ((mode & FALLOC_FL_INSERT_RANGE) &&
0267         (mode & ~FALLOC_FL_INSERT_RANGE))
0268         return -EINVAL;
0269 
0270     /* Unshare range should only be used with allocate mode. */
0271     if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
0272         (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
0273         return -EINVAL;
0274 
0275     if (!(file->f_mode & FMODE_WRITE))
0276         return -EBADF;
0277 
0278     /*
0279      * We can only allow pure fallocate on append only files
0280      */
0281     if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
0282         return -EPERM;
0283 
0284     if (IS_IMMUTABLE(inode))
0285         return -EPERM;
0286 
0287     /*
0288      * We cannot allow any fallocate operation on an active swapfile
0289      */
0290     if (IS_SWAPFILE(inode))
0291         return -ETXTBSY;
0292 
0293     /*
0294      * Revalidate the write permissions, in case security policy has
0295      * changed since the files were opened.
0296      */
0297     ret = security_file_permission(file, MAY_WRITE);
0298     if (ret)
0299         return ret;
0300 
0301     if (S_ISFIFO(inode->i_mode))
0302         return -ESPIPE;
0303 
0304     /*
0305      * Let individual file system decide if it supports preallocation
0306      * for directories or not.
0307      */
0308     if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
0309         !S_ISBLK(inode->i_mode))
0310         return -ENODEV;
0311 
0312     /* Check for wrap through zero too */
0313     if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
0314         return -EFBIG;
0315 
0316     if (!file->f_op->fallocate)
0317         return -EOPNOTSUPP;
0318 
0319     sb_start_write(inode->i_sb);
0320     ret = file->f_op->fallocate(file, mode, offset, len);
0321 
0322     /*
0323      * Create inotify and fanotify events.
0324      *
0325      * To keep the logic simple always create events if fallocate succeeds.
0326      * This implies that events are even created if the file size remains
0327      * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
0328      */
0329     if (ret == 0)
0330         fsnotify_modify(file);
0331 
0332     sb_end_write(inode->i_sb);
0333     return ret;
0334 }
0335 EXPORT_SYMBOL_GPL(vfs_fallocate);
0336 
0337 SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
0338 {
0339     struct fd f = fdget(fd);
0340     int error = -EBADF;
0341 
0342     if (f.file) {
0343         error = vfs_fallocate(f.file, mode, offset, len);
0344         fdput(f);
0345     }
0346     return error;
0347 }
0348 
0349 /*
0350  * access() needs to use the real uid/gid, not the effective uid/gid.
0351  * We do this by temporarily clearing all FS-related capabilities and
0352  * switching the fsuid/fsgid around to the real ones.
0353  */
0354 SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
0355 {
0356     const struct cred *old_cred;
0357     struct cred *override_cred;
0358     struct path path;
0359     struct inode *inode;
0360     int res;
0361     unsigned int lookup_flags = LOOKUP_FOLLOW;
0362 
0363     if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
0364         return -EINVAL;
0365 
0366     override_cred = prepare_creds();
0367     if (!override_cred)
0368         return -ENOMEM;
0369 
0370     override_cred->fsuid = override_cred->uid;
0371     override_cred->fsgid = override_cred->gid;
0372 
0373     if (!issecure(SECURE_NO_SETUID_FIXUP)) {
0374         /* Clear the capabilities if we switch to a non-root user */
0375         kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
0376         if (!uid_eq(override_cred->uid, root_uid))
0377             cap_clear(override_cred->cap_effective);
0378         else
0379             override_cred->cap_effective =
0380                 override_cred->cap_permitted;
0381     }
0382 
0383     old_cred = override_creds(override_cred);
0384 retry:
0385     res = user_path_at(dfd, filename, lookup_flags, &path);
0386     if (res)
0387         goto out;
0388 
0389     inode = d_backing_inode(path.dentry);
0390 
0391     if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
0392         /*
0393          * MAY_EXEC on regular files is denied if the fs is mounted
0394          * with the "noexec" flag.
0395          */
0396         res = -EACCES;
0397         if (path_noexec(&path))
0398             goto out_path_release;
0399     }
0400 
0401     res = inode_permission(inode, mode | MAY_ACCESS);
0402     /* SuS v2 requires we report a read only fs too */
0403     if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
0404         goto out_path_release;
0405     /*
0406      * This is a rare case where using __mnt_is_readonly()
0407      * is OK without a mnt_want/drop_write() pair.  Since
0408      * no actual write to the fs is performed here, we do
0409      * not need to telegraph to that to anyone.
0410      *
0411      * By doing this, we accept that this access is
0412      * inherently racy and know that the fs may change
0413      * state before we even see this result.
0414      */
0415     if (__mnt_is_readonly(path.mnt))
0416         res = -EROFS;
0417 
0418 out_path_release:
0419     path_put(&path);
0420     if (retry_estale(res, lookup_flags)) {
0421         lookup_flags |= LOOKUP_REVAL;
0422         goto retry;
0423     }
0424 out:
0425     revert_creds(old_cred);
0426     put_cred(override_cred);
0427     return res;
0428 }
0429 
0430 SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
0431 {
0432     return sys_faccessat(AT_FDCWD, filename, mode);
0433 }
0434 
0435 SYSCALL_DEFINE1(chdir, const char __user *, filename)
0436 {
0437     struct path path;
0438     int error;
0439     unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
0440 retry:
0441     error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
0442     if (error)
0443         goto out;
0444 
0445     error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
0446     if (error)
0447         goto dput_and_out;
0448 
0449     set_fs_pwd(current->fs, &path);
0450 
0451 dput_and_out:
0452     path_put(&path);
0453     if (retry_estale(error, lookup_flags)) {
0454         lookup_flags |= LOOKUP_REVAL;
0455         goto retry;
0456     }
0457 out:
0458     return error;
0459 }
0460 
0461 SYSCALL_DEFINE1(fchdir, unsigned int, fd)
0462 {
0463     struct fd f = fdget_raw(fd);
0464     struct inode *inode;
0465     int error = -EBADF;
0466 
0467     error = -EBADF;
0468     if (!f.file)
0469         goto out;
0470 
0471     inode = file_inode(f.file);
0472 
0473     error = -ENOTDIR;
0474     if (!S_ISDIR(inode->i_mode))
0475         goto out_putf;
0476 
0477     error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
0478     if (!error)
0479         set_fs_pwd(current->fs, &f.file->f_path);
0480 out_putf:
0481     fdput(f);
0482 out:
0483     return error;
0484 }
0485 
0486 SYSCALL_DEFINE1(chroot, const char __user *, filename)
0487 {
0488     struct path path;
0489     int error;
0490     unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
0491 retry:
0492     error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
0493     if (error)
0494         goto out;
0495 
0496     error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
0497     if (error)
0498         goto dput_and_out;
0499 
0500     error = -EPERM;
0501     if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
0502         goto dput_and_out;
0503     error = security_path_chroot(&path);
0504     if (error)
0505         goto dput_and_out;
0506 
0507     set_fs_root(current->fs, &path);
0508     error = 0;
0509 dput_and_out:
0510     path_put(&path);
0511     if (retry_estale(error, lookup_flags)) {
0512         lookup_flags |= LOOKUP_REVAL;
0513         goto retry;
0514     }
0515 out:
0516     return error;
0517 }
0518 
0519 static int chmod_common(const struct path *path, umode_t mode)
0520 {
0521     struct inode *inode = path->dentry->d_inode;
0522     struct inode *delegated_inode = NULL;
0523     struct iattr newattrs;
0524     int error;
0525 
0526     error = mnt_want_write(path->mnt);
0527     if (error)
0528         return error;
0529 retry_deleg:
0530     inode_lock(inode);
0531     error = security_path_chmod(path, mode);
0532     if (error)
0533         goto out_unlock;
0534     newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
0535     newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
0536     error = notify_change(path->dentry, &newattrs, &delegated_inode);
0537 out_unlock:
0538     inode_unlock(inode);
0539     if (delegated_inode) {
0540         error = break_deleg_wait(&delegated_inode);
0541         if (!error)
0542             goto retry_deleg;
0543     }
0544     mnt_drop_write(path->mnt);
0545     return error;
0546 }
0547 
0548 SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
0549 {
0550     struct fd f = fdget(fd);
0551     int err = -EBADF;
0552 
0553     if (f.file) {
0554         audit_file(f.file);
0555         err = chmod_common(&f.file->f_path, mode);
0556         fdput(f);
0557     }
0558     return err;
0559 }
0560 
0561 SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode)
0562 {
0563     struct path path;
0564     int error;
0565     unsigned int lookup_flags = LOOKUP_FOLLOW;
0566 retry:
0567     error = user_path_at(dfd, filename, lookup_flags, &path);
0568     if (!error) {
0569         error = chmod_common(&path, mode);
0570         path_put(&path);
0571         if (retry_estale(error, lookup_flags)) {
0572             lookup_flags |= LOOKUP_REVAL;
0573             goto retry;
0574         }
0575     }
0576     return error;
0577 }
0578 
0579 SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
0580 {
0581     return sys_fchmodat(AT_FDCWD, filename, mode);
0582 }
0583 
0584 static int chown_common(const struct path *path, uid_t user, gid_t group)
0585 {
0586     struct inode *inode = path->dentry->d_inode;
0587     struct inode *delegated_inode = NULL;
0588     int error;
0589     struct iattr newattrs;
0590     kuid_t uid;
0591     kgid_t gid;
0592 
0593     uid = make_kuid(current_user_ns(), user);
0594     gid = make_kgid(current_user_ns(), group);
0595 
0596 retry_deleg:
0597     newattrs.ia_valid =  ATTR_CTIME;
0598     if (user != (uid_t) -1) {
0599         if (!uid_valid(uid))
0600             return -EINVAL;
0601         newattrs.ia_valid |= ATTR_UID;
0602         newattrs.ia_uid = uid;
0603     }
0604     if (group != (gid_t) -1) {
0605         if (!gid_valid(gid))
0606             return -EINVAL;
0607         newattrs.ia_valid |= ATTR_GID;
0608         newattrs.ia_gid = gid;
0609     }
0610     if (!S_ISDIR(inode->i_mode))
0611         newattrs.ia_valid |=
0612             ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
0613     inode_lock(inode);
0614     error = security_path_chown(path, uid, gid);
0615     if (!error)
0616         error = notify_change(path->dentry, &newattrs, &delegated_inode);
0617     inode_unlock(inode);
0618     if (delegated_inode) {
0619         error = break_deleg_wait(&delegated_inode);
0620         if (!error)
0621             goto retry_deleg;
0622     }
0623     return error;
0624 }
0625 
0626 SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
0627         gid_t, group, int, flag)
0628 {
0629     struct path path;
0630     int error = -EINVAL;
0631     int lookup_flags;
0632 
0633     if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
0634         goto out;
0635 
0636     lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
0637     if (flag & AT_EMPTY_PATH)
0638         lookup_flags |= LOOKUP_EMPTY;
0639 retry:
0640     error = user_path_at(dfd, filename, lookup_flags, &path);
0641     if (error)
0642         goto out;
0643     error = mnt_want_write(path.mnt);
0644     if (error)
0645         goto out_release;
0646     error = chown_common(&path, user, group);
0647     mnt_drop_write(path.mnt);
0648 out_release:
0649     path_put(&path);
0650     if (retry_estale(error, lookup_flags)) {
0651         lookup_flags |= LOOKUP_REVAL;
0652         goto retry;
0653     }
0654 out:
0655     return error;
0656 }
0657 
0658 SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
0659 {
0660     return sys_fchownat(AT_FDCWD, filename, user, group, 0);
0661 }
0662 
0663 SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
0664 {
0665     return sys_fchownat(AT_FDCWD, filename, user, group,
0666                 AT_SYMLINK_NOFOLLOW);
0667 }
0668 
0669 SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
0670 {
0671     struct fd f = fdget(fd);
0672     int error = -EBADF;
0673 
0674     if (!f.file)
0675         goto out;
0676 
0677     error = mnt_want_write_file(f.file);
0678     if (error)
0679         goto out_fput;
0680     audit_file(f.file);
0681     error = chown_common(&f.file->f_path, user, group);
0682     mnt_drop_write_file(f.file);
0683 out_fput:
0684     fdput(f);
0685 out:
0686     return error;
0687 }
0688 
0689 int open_check_o_direct(struct file *f)
0690 {
0691     /* NB: we're sure to have correct a_ops only after f_op->open */
0692     if (f->f_flags & O_DIRECT) {
0693         if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
0694             return -EINVAL;
0695     }
0696     return 0;
0697 }
0698 
0699 static int do_dentry_open(struct file *f,
0700               struct inode *inode,
0701               int (*open)(struct inode *, struct file *),
0702               const struct cred *cred)
0703 {
0704     static const struct file_operations empty_fops = {};
0705     int error;
0706 
0707     f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
0708                 FMODE_PREAD | FMODE_PWRITE;
0709 
0710     path_get(&f->f_path);
0711     f->f_inode = inode;
0712     f->f_mapping = inode->i_mapping;
0713 
0714     if (unlikely(f->f_flags & O_PATH)) {
0715         f->f_mode = FMODE_PATH;
0716         f->f_op = &empty_fops;
0717         return 0;
0718     }
0719 
0720     if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
0721         error = get_write_access(inode);
0722         if (unlikely(error))
0723             goto cleanup_file;
0724         error = __mnt_want_write(f->f_path.mnt);
0725         if (unlikely(error)) {
0726             put_write_access(inode);
0727             goto cleanup_file;
0728         }
0729         f->f_mode |= FMODE_WRITER;
0730     }
0731 
0732     /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
0733     if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
0734         f->f_mode |= FMODE_ATOMIC_POS;
0735 
0736     f->f_op = fops_get(inode->i_fop);
0737     if (unlikely(WARN_ON(!f->f_op))) {
0738         error = -ENODEV;
0739         goto cleanup_all;
0740     }
0741 
0742     error = security_file_open(f, cred);
0743     if (error)
0744         goto cleanup_all;
0745 
0746     error = break_lease(locks_inode(f), f->f_flags);
0747     if (error)
0748         goto cleanup_all;
0749 
0750     if (!open)
0751         open = f->f_op->open;
0752     if (open) {
0753         error = open(inode, f);
0754         if (error)
0755             goto cleanup_all;
0756     }
0757     if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
0758         i_readcount_inc(inode);
0759     if ((f->f_mode & FMODE_READ) &&
0760          likely(f->f_op->read || f->f_op->read_iter))
0761         f->f_mode |= FMODE_CAN_READ;
0762     if ((f->f_mode & FMODE_WRITE) &&
0763          likely(f->f_op->write || f->f_op->write_iter))
0764         f->f_mode |= FMODE_CAN_WRITE;
0765 
0766     f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
0767 
0768     file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
0769 
0770     return 0;
0771 
0772 cleanup_all:
0773     fops_put(f->f_op);
0774     if (f->f_mode & FMODE_WRITER) {
0775         put_write_access(inode);
0776         __mnt_drop_write(f->f_path.mnt);
0777     }
0778 cleanup_file:
0779     path_put(&f->f_path);
0780     f->f_path.mnt = NULL;
0781     f->f_path.dentry = NULL;
0782     f->f_inode = NULL;
0783     return error;
0784 }
0785 
0786 /**
0787  * finish_open - finish opening a file
0788  * @file: file pointer
0789  * @dentry: pointer to dentry
0790  * @open: open callback
0791  * @opened: state of open
0792  *
0793  * This can be used to finish opening a file passed to i_op->atomic_open().
0794  *
0795  * If the open callback is set to NULL, then the standard f_op->open()
0796  * filesystem callback is substituted.
0797  *
0798  * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
0799  * the return value of d_splice_alias(), then the caller needs to perform dput()
0800  * on it after finish_open().
0801  *
0802  * On successful return @file is a fully instantiated open file.  After this, if
0803  * an error occurs in ->atomic_open(), it needs to clean up with fput().
0804  *
0805  * Returns zero on success or -errno if the open failed.
0806  */
0807 int finish_open(struct file *file, struct dentry *dentry,
0808         int (*open)(struct inode *, struct file *),
0809         int *opened)
0810 {
0811     int error;
0812     BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
0813 
0814     file->f_path.dentry = dentry;
0815     error = do_dentry_open(file, d_backing_inode(dentry), open,
0816                    current_cred());
0817     if (!error)
0818         *opened |= FILE_OPENED;
0819 
0820     return error;
0821 }
0822 EXPORT_SYMBOL(finish_open);
0823 
0824 /**
0825  * finish_no_open - finish ->atomic_open() without opening the file
0826  *
0827  * @file: file pointer
0828  * @dentry: dentry or NULL (as returned from ->lookup())
0829  *
0830  * This can be used to set the result of a successful lookup in ->atomic_open().
0831  *
0832  * NB: unlike finish_open() this function does consume the dentry reference and
0833  * the caller need not dput() it.
0834  *
0835  * Returns "1" which must be the return value of ->atomic_open() after having
0836  * called this function.
0837  */
0838 int finish_no_open(struct file *file, struct dentry *dentry)
0839 {
0840     file->f_path.dentry = dentry;
0841     return 1;
0842 }
0843 EXPORT_SYMBOL(finish_no_open);
0844 
0845 char *file_path(struct file *filp, char *buf, int buflen)
0846 {
0847     return d_path(&filp->f_path, buf, buflen);
0848 }
0849 EXPORT_SYMBOL(file_path);
0850 
0851 /**
0852  * vfs_open - open the file at the given path
0853  * @path: path to open
0854  * @file: newly allocated file with f_flag initialized
0855  * @cred: credentials to use
0856  */
0857 int vfs_open(const struct path *path, struct file *file,
0858          const struct cred *cred)
0859 {
0860     struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags);
0861 
0862     if (IS_ERR(dentry))
0863         return PTR_ERR(dentry);
0864 
0865     file->f_path = *path;
0866     return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
0867 }
0868 
0869 struct file *dentry_open(const struct path *path, int flags,
0870              const struct cred *cred)
0871 {
0872     int error;
0873     struct file *f;
0874 
0875     validate_creds(cred);
0876 
0877     /* We must always pass in a valid mount pointer. */
0878     BUG_ON(!path->mnt);
0879 
0880     f = get_empty_filp();
0881     if (!IS_ERR(f)) {
0882         f->f_flags = flags;
0883         error = vfs_open(path, f, cred);
0884         if (!error) {
0885             /* from now on we need fput() to dispose of f */
0886             error = open_check_o_direct(f);
0887             if (error) {
0888                 fput(f);
0889                 f = ERR_PTR(error);
0890             }
0891         } else { 
0892             put_filp(f);
0893             f = ERR_PTR(error);
0894         }
0895     }
0896     return f;
0897 }
0898 EXPORT_SYMBOL(dentry_open);
0899 
0900 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
0901 {
0902     int lookup_flags = 0;
0903     int acc_mode = ACC_MODE(flags);
0904 
0905     if (flags & (O_CREAT | __O_TMPFILE))
0906         op->mode = (mode & S_IALLUGO) | S_IFREG;
0907     else
0908         op->mode = 0;
0909 
0910     /* Must never be set by userspace */
0911     flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
0912 
0913     /*
0914      * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
0915      * check for O_DSYNC if the need any syncing at all we enforce it's
0916      * always set instead of having to deal with possibly weird behaviour
0917      * for malicious applications setting only __O_SYNC.
0918      */
0919     if (flags & __O_SYNC)
0920         flags |= O_DSYNC;
0921 
0922     if (flags & __O_TMPFILE) {
0923         if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
0924             return -EINVAL;
0925         if (!(acc_mode & MAY_WRITE))
0926             return -EINVAL;
0927     } else if (flags & O_PATH) {
0928         /*
0929          * If we have O_PATH in the open flag. Then we
0930          * cannot have anything other than the below set of flags
0931          */
0932         flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
0933         acc_mode = 0;
0934     }
0935 
0936     op->open_flag = flags;
0937 
0938     /* O_TRUNC implies we need access checks for write permissions */
0939     if (flags & O_TRUNC)
0940         acc_mode |= MAY_WRITE;
0941 
0942     /* Allow the LSM permission hook to distinguish append
0943        access from general write access. */
0944     if (flags & O_APPEND)
0945         acc_mode |= MAY_APPEND;
0946 
0947     op->acc_mode = acc_mode;
0948 
0949     op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
0950 
0951     if (flags & O_CREAT) {
0952         op->intent |= LOOKUP_CREATE;
0953         if (flags & O_EXCL)
0954             op->intent |= LOOKUP_EXCL;
0955     }
0956 
0957     if (flags & O_DIRECTORY)
0958         lookup_flags |= LOOKUP_DIRECTORY;
0959     if (!(flags & O_NOFOLLOW))
0960         lookup_flags |= LOOKUP_FOLLOW;
0961     op->lookup_flags = lookup_flags;
0962     return 0;
0963 }
0964 
0965 /**
0966  * file_open_name - open file and return file pointer
0967  *
0968  * @name:   struct filename containing path to open
0969  * @flags:  open flags as per the open(2) second argument
0970  * @mode:   mode for the new file if O_CREAT is set, else ignored
0971  *
0972  * This is the helper to open a file from kernelspace if you really
0973  * have to.  But in generally you should not do this, so please move
0974  * along, nothing to see here..
0975  */
0976 struct file *file_open_name(struct filename *name, int flags, umode_t mode)
0977 {
0978     struct open_flags op;
0979     int err = build_open_flags(flags, mode, &op);
0980     return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
0981 }
0982 
0983 /**
0984  * filp_open - open file and return file pointer
0985  *
0986  * @filename:   path to open
0987  * @flags:  open flags as per the open(2) second argument
0988  * @mode:   mode for the new file if O_CREAT is set, else ignored
0989  *
0990  * This is the helper to open a file from kernelspace if you really
0991  * have to.  But in generally you should not do this, so please move
0992  * along, nothing to see here..
0993  */
0994 struct file *filp_open(const char *filename, int flags, umode_t mode)
0995 {
0996     struct filename *name = getname_kernel(filename);
0997     struct file *file = ERR_CAST(name);
0998     
0999     if (!IS_ERR(name)) {
1000         file = file_open_name(name, flags, mode);
1001         putname(name);
1002     }
1003     return file;
1004 }
1005 EXPORT_SYMBOL(filp_open);
1006 
1007 struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
1008                 const char *filename, int flags, umode_t mode)
1009 {
1010     struct open_flags op;
1011     int err = build_open_flags(flags, mode, &op);
1012     if (err)
1013         return ERR_PTR(err);
1014     return do_file_open_root(dentry, mnt, filename, &op);
1015 }
1016 EXPORT_SYMBOL(file_open_root);
1017 
1018 struct file *filp_clone_open(struct file *oldfile)
1019 {
1020     struct file *file;
1021     int retval;
1022 
1023     file = get_empty_filp();
1024     if (IS_ERR(file))
1025         return file;
1026 
1027     file->f_flags = oldfile->f_flags;
1028     retval = vfs_open(&oldfile->f_path, file, oldfile->f_cred);
1029     if (retval) {
1030         put_filp(file);
1031         return ERR_PTR(retval);
1032     }
1033 
1034     return file;
1035 }
1036 EXPORT_SYMBOL(filp_clone_open);
1037 
1038 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
1039 {
1040     struct open_flags op;
1041     int fd = build_open_flags(flags, mode, &op);
1042     struct filename *tmp;
1043 
1044     if (fd)
1045         return fd;
1046 
1047     tmp = getname(filename);
1048     if (IS_ERR(tmp))
1049         return PTR_ERR(tmp);
1050 
1051     fd = get_unused_fd_flags(flags);
1052     if (fd >= 0) {
1053         struct file *f = do_filp_open(dfd, tmp, &op);
1054         if (IS_ERR(f)) {
1055             put_unused_fd(fd);
1056             fd = PTR_ERR(f);
1057         } else {
1058             fsnotify_open(f);
1059             fd_install(fd, f);
1060         }
1061     }
1062     putname(tmp);
1063     return fd;
1064 }
1065 
1066 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
1067 {
1068     if (force_o_largefile())
1069         flags |= O_LARGEFILE;
1070 
1071     return do_sys_open(AT_FDCWD, filename, flags, mode);
1072 }
1073 
1074 SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
1075         umode_t, mode)
1076 {
1077     if (force_o_largefile())
1078         flags |= O_LARGEFILE;
1079 
1080     return do_sys_open(dfd, filename, flags, mode);
1081 }
1082 
1083 #ifndef __alpha__
1084 
1085 /*
1086  * For backward compatibility?  Maybe this should be moved
1087  * into arch/i386 instead?
1088  */
1089 SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
1090 {
1091     return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
1092 }
1093 
1094 #endif
1095 
1096 /*
1097  * "id" is the POSIX thread ID. We use the
1098  * files pointer for this..
1099  */
1100 int filp_close(struct file *filp, fl_owner_t id)
1101 {
1102     int retval = 0;
1103 
1104     if (!file_count(filp)) {
1105         printk(KERN_ERR "VFS: Close: file count is 0\n");
1106         return 0;
1107     }
1108 
1109     if (filp->f_op->flush)
1110         retval = filp->f_op->flush(filp, id);
1111 
1112     if (likely(!(filp->f_mode & FMODE_PATH))) {
1113         dnotify_flush(filp, id);
1114         locks_remove_posix(filp, id);
1115     }
1116     fput(filp);
1117     return retval;
1118 }
1119 
1120 EXPORT_SYMBOL(filp_close);
1121 
1122 /*
1123  * Careful here! We test whether the file pointer is NULL before
1124  * releasing the fd. This ensures that one clone task can't release
1125  * an fd while another clone is opening it.
1126  */
1127 SYSCALL_DEFINE1(close, unsigned int, fd)
1128 {
1129     int retval = __close_fd(current->files, fd);
1130 
1131     /* can't restart close syscall because file table entry was cleared */
1132     if (unlikely(retval == -ERESTARTSYS ||
1133              retval == -ERESTARTNOINTR ||
1134              retval == -ERESTARTNOHAND ||
1135              retval == -ERESTART_RESTARTBLOCK))
1136         retval = -EINTR;
1137 
1138     return retval;
1139 }
1140 EXPORT_SYMBOL(sys_close);
1141 
1142 /*
1143  * This routine simulates a hangup on the tty, to arrange that users
1144  * are given clean terminals at login time.
1145  */
1146 SYSCALL_DEFINE0(vhangup)
1147 {
1148     if (capable(CAP_SYS_TTY_CONFIG)) {
1149         tty_vhangup_self();
1150         return 0;
1151     }
1152     return -EPERM;
1153 }
1154 
1155 /*
1156  * Called when an inode is about to be open.
1157  * We use this to disallow opening large files on 32bit systems if
1158  * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
1159  * on this flag in sys_open.
1160  */
1161 int generic_file_open(struct inode * inode, struct file * filp)
1162 {
1163     if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
1164         return -EOVERFLOW;
1165     return 0;
1166 }
1167 
1168 EXPORT_SYMBOL(generic_file_open);
1169 
1170 /*
1171  * This is used by subsystems that don't want seekable
1172  * file descriptors. The function is not supposed to ever fail, the only
1173  * reason it returns an 'int' and not 'void' is so that it can be plugged
1174  * directly into file_operations structure.
1175  */
1176 int nonseekable_open(struct inode *inode, struct file *filp)
1177 {
1178     filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1179     return 0;
1180 }
1181 
1182 EXPORT_SYMBOL(nonseekable_open);