Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Author: Andrei Vagin <avagin@openvz.org>
0004  * Author: Dmitry Safonov <dima@arista.com>
0005  */
0006 
0007 #include <linux/time_namespace.h>
0008 #include <linux/user_namespace.h>
0009 #include <linux/sched/signal.h>
0010 #include <linux/sched/task.h>
0011 #include <linux/clocksource.h>
0012 #include <linux/seq_file.h>
0013 #include <linux/proc_ns.h>
0014 #include <linux/export.h>
0015 #include <linux/time.h>
0016 #include <linux/slab.h>
0017 #include <linux/cred.h>
0018 #include <linux/err.h>
0019 #include <linux/mm.h>
0020 
0021 #include <vdso/datapage.h>
0022 
0023 ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
0024                 struct timens_offsets *ns_offsets)
0025 {
0026     ktime_t offset;
0027 
0028     switch (clockid) {
0029     case CLOCK_MONOTONIC:
0030         offset = timespec64_to_ktime(ns_offsets->monotonic);
0031         break;
0032     case CLOCK_BOOTTIME:
0033     case CLOCK_BOOTTIME_ALARM:
0034         offset = timespec64_to_ktime(ns_offsets->boottime);
0035         break;
0036     default:
0037         return tim;
0038     }
0039 
0040     /*
0041      * Check that @tim value is in [offset, KTIME_MAX + offset]
0042      * and subtract offset.
0043      */
0044     if (tim < offset) {
0045         /*
0046          * User can specify @tim *absolute* value - if it's lesser than
0047          * the time namespace's offset - it's already expired.
0048          */
0049         tim = 0;
0050     } else {
0051         tim = ktime_sub(tim, offset);
0052         if (unlikely(tim > KTIME_MAX))
0053             tim = KTIME_MAX;
0054     }
0055 
0056     return tim;
0057 }
0058 
0059 static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
0060 {
0061     return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
0062 }
0063 
0064 static void dec_time_namespaces(struct ucounts *ucounts)
0065 {
0066     dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
0067 }
0068 
0069 /**
0070  * clone_time_ns - Clone a time namespace
0071  * @user_ns:    User namespace which owns a new namespace.
0072  * @old_ns: Namespace to clone
0073  *
0074  * Clone @old_ns and set the clone refcount to 1
0075  *
0076  * Return: The new namespace or ERR_PTR.
0077  */
0078 static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
0079                       struct time_namespace *old_ns)
0080 {
0081     struct time_namespace *ns;
0082     struct ucounts *ucounts;
0083     int err;
0084 
0085     err = -ENOSPC;
0086     ucounts = inc_time_namespaces(user_ns);
0087     if (!ucounts)
0088         goto fail;
0089 
0090     err = -ENOMEM;
0091     ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
0092     if (!ns)
0093         goto fail_dec;
0094 
0095     refcount_set(&ns->ns.count, 1);
0096 
0097     ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
0098     if (!ns->vvar_page)
0099         goto fail_free;
0100 
0101     err = ns_alloc_inum(&ns->ns);
0102     if (err)
0103         goto fail_free_page;
0104 
0105     ns->ucounts = ucounts;
0106     ns->ns.ops = &timens_operations;
0107     ns->user_ns = get_user_ns(user_ns);
0108     ns->offsets = old_ns->offsets;
0109     ns->frozen_offsets = false;
0110     return ns;
0111 
0112 fail_free_page:
0113     __free_page(ns->vvar_page);
0114 fail_free:
0115     kfree(ns);
0116 fail_dec:
0117     dec_time_namespaces(ucounts);
0118 fail:
0119     return ERR_PTR(err);
0120 }
0121 
0122 /**
0123  * copy_time_ns - Create timens_for_children from @old_ns
0124  * @flags:  Cloning flags
0125  * @user_ns:    User namespace which owns a new namespace.
0126  * @old_ns: Namespace to clone
0127  *
0128  * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
0129  * adds a refcounter to @old_ns otherwise.
0130  *
0131  * Return: timens_for_children namespace or ERR_PTR.
0132  */
0133 struct time_namespace *copy_time_ns(unsigned long flags,
0134     struct user_namespace *user_ns, struct time_namespace *old_ns)
0135 {
0136     if (!(flags & CLONE_NEWTIME))
0137         return get_time_ns(old_ns);
0138 
0139     return clone_time_ns(user_ns, old_ns);
0140 }
0141 
0142 static struct timens_offset offset_from_ts(struct timespec64 off)
0143 {
0144     struct timens_offset ret;
0145 
0146     ret.sec = off.tv_sec;
0147     ret.nsec = off.tv_nsec;
0148 
0149     return ret;
0150 }
0151 
0152 /*
0153  * A time namespace VVAR page has the same layout as the VVAR page which
0154  * contains the system wide VDSO data.
0155  *
0156  * For a normal task the VVAR pages are installed in the normal ordering:
0157  *     VVAR
0158  *     PVCLOCK
0159  *     HVCLOCK
0160  *     TIMENS   <- Not really required
0161  *
0162  * Now for a timens task the pages are installed in the following order:
0163  *     TIMENS
0164  *     PVCLOCK
0165  *     HVCLOCK
0166  *     VVAR
0167  *
0168  * The check for vdso_data->clock_mode is in the unlikely path of
0169  * the seq begin magic. So for the non-timens case most of the time
0170  * 'seq' is even, so the branch is not taken.
0171  *
0172  * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
0173  * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
0174  * update to finish and for 'seq' to become even anyway.
0175  *
0176  * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
0177  * enforces the time namespace handling path.
0178  */
0179 static void timens_setup_vdso_data(struct vdso_data *vdata,
0180                    struct time_namespace *ns)
0181 {
0182     struct timens_offset *offset = vdata->offset;
0183     struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
0184     struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
0185 
0186     vdata->seq          = 1;
0187     vdata->clock_mode       = VDSO_CLOCKMODE_TIMENS;
0188     offset[CLOCK_MONOTONIC]     = monotonic;
0189     offset[CLOCK_MONOTONIC_RAW] = monotonic;
0190     offset[CLOCK_MONOTONIC_COARSE]  = monotonic;
0191     offset[CLOCK_BOOTTIME]      = boottime;
0192     offset[CLOCK_BOOTTIME_ALARM]    = boottime;
0193 }
0194 
0195 /*
0196  * Protects possibly multiple offsets writers racing each other
0197  * and tasks entering the namespace.
0198  */
0199 static DEFINE_MUTEX(offset_lock);
0200 
0201 static void timens_set_vvar_page(struct task_struct *task,
0202                 struct time_namespace *ns)
0203 {
0204     struct vdso_data *vdata;
0205     unsigned int i;
0206 
0207     if (ns == &init_time_ns)
0208         return;
0209 
0210     /* Fast-path, taken by every task in namespace except the first. */
0211     if (likely(ns->frozen_offsets))
0212         return;
0213 
0214     mutex_lock(&offset_lock);
0215     /* Nothing to-do: vvar_page has been already initialized. */
0216     if (ns->frozen_offsets)
0217         goto out;
0218 
0219     ns->frozen_offsets = true;
0220     vdata = arch_get_vdso_data(page_address(ns->vvar_page));
0221 
0222     for (i = 0; i < CS_BASES; i++)
0223         timens_setup_vdso_data(&vdata[i], ns);
0224 
0225 out:
0226     mutex_unlock(&offset_lock);
0227 }
0228 
0229 void free_time_ns(struct time_namespace *ns)
0230 {
0231     dec_time_namespaces(ns->ucounts);
0232     put_user_ns(ns->user_ns);
0233     ns_free_inum(&ns->ns);
0234     __free_page(ns->vvar_page);
0235     kfree(ns);
0236 }
0237 
0238 static struct time_namespace *to_time_ns(struct ns_common *ns)
0239 {
0240     return container_of(ns, struct time_namespace, ns);
0241 }
0242 
0243 static struct ns_common *timens_get(struct task_struct *task)
0244 {
0245     struct time_namespace *ns = NULL;
0246     struct nsproxy *nsproxy;
0247 
0248     task_lock(task);
0249     nsproxy = task->nsproxy;
0250     if (nsproxy) {
0251         ns = nsproxy->time_ns;
0252         get_time_ns(ns);
0253     }
0254     task_unlock(task);
0255 
0256     return ns ? &ns->ns : NULL;
0257 }
0258 
0259 static struct ns_common *timens_for_children_get(struct task_struct *task)
0260 {
0261     struct time_namespace *ns = NULL;
0262     struct nsproxy *nsproxy;
0263 
0264     task_lock(task);
0265     nsproxy = task->nsproxy;
0266     if (nsproxy) {
0267         ns = nsproxy->time_ns_for_children;
0268         get_time_ns(ns);
0269     }
0270     task_unlock(task);
0271 
0272     return ns ? &ns->ns : NULL;
0273 }
0274 
/* Drop a reference on the time namespace that embeds @ns. */
static void timens_put(struct ns_common *ns)
{
	struct time_namespace *time_ns = to_time_ns(ns);

	put_time_ns(time_ns);
}
0279 
/*
 * Finish moving @tsk into @ns: make sure the VVAR page of @ns has been
 * set up (which freezes the namespace offsets on first use), then let
 * vdso_join_timens() do its part for @tsk.
 */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
	/* The VVAR page is prepared before vdso_join_timens() is called. */
	timens_set_vvar_page(tsk, ns);

	vdso_join_timens(tsk, ns);
}
0285 
0286 static int timens_install(struct nsset *nsset, struct ns_common *new)
0287 {
0288     struct nsproxy *nsproxy = nsset->nsproxy;
0289     struct time_namespace *ns = to_time_ns(new);
0290 
0291     if (!current_is_single_threaded())
0292         return -EUSERS;
0293 
0294     if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
0295         !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
0296         return -EPERM;
0297 
0298     get_time_ns(ns);
0299     put_time_ns(nsproxy->time_ns);
0300     nsproxy->time_ns = ns;
0301 
0302     get_time_ns(ns);
0303     put_time_ns(nsproxy->time_ns_for_children);
0304     nsproxy->time_ns_for_children = ns;
0305     return 0;
0306 }
0307 
0308 void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
0309 {
0310     struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
0311     struct time_namespace *ns = to_time_ns(nsc);
0312 
0313     /* create_new_namespaces() already incremented the ref counter */
0314     if (nsproxy->time_ns == nsproxy->time_ns_for_children)
0315         return;
0316 
0317     get_time_ns(ns);
0318     put_time_ns(nsproxy->time_ns);
0319     nsproxy->time_ns = ns;
0320 
0321     timens_commit(tsk, ns);
0322 }
0323 
0324 static struct user_namespace *timens_owner(struct ns_common *ns)
0325 {
0326     return to_time_ns(ns)->user_ns;
0327 }
0328 
0329 static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
0330 {
0331     char *clock;
0332 
0333     switch (clockid) {
0334     case CLOCK_BOOTTIME:
0335         clock = "boottime";
0336         break;
0337     case CLOCK_MONOTONIC:
0338         clock = "monotonic";
0339         break;
0340     default:
0341         clock = "unknown";
0342         break;
0343     }
0344     seq_printf(m, "%-10s %10lld %9ld\n", clock, ts->tv_sec, ts->tv_nsec);
0345 }
0346 
0347 void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
0348 {
0349     struct ns_common *ns;
0350     struct time_namespace *time_ns;
0351 
0352     ns = timens_for_children_get(p);
0353     if (!ns)
0354         return;
0355     time_ns = to_time_ns(ns);
0356 
0357     show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
0358     show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
0359     put_time_ns(time_ns);
0360 }
0361 
0362 int proc_timens_set_offset(struct file *file, struct task_struct *p,
0363                struct proc_timens_offset *offsets, int noffsets)
0364 {
0365     struct ns_common *ns;
0366     struct time_namespace *time_ns;
0367     struct timespec64 tp;
0368     int i, err;
0369 
0370     ns = timens_for_children_get(p);
0371     if (!ns)
0372         return -ESRCH;
0373     time_ns = to_time_ns(ns);
0374 
0375     if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
0376         put_time_ns(time_ns);
0377         return -EPERM;
0378     }
0379 
0380     for (i = 0; i < noffsets; i++) {
0381         struct proc_timens_offset *off = &offsets[i];
0382 
0383         switch (off->clockid) {
0384         case CLOCK_MONOTONIC:
0385             ktime_get_ts64(&tp);
0386             break;
0387         case CLOCK_BOOTTIME:
0388             ktime_get_boottime_ts64(&tp);
0389             break;
0390         default:
0391             err = -EINVAL;
0392             goto out;
0393         }
0394 
0395         err = -ERANGE;
0396 
0397         if (off->val.tv_sec > KTIME_SEC_MAX ||
0398             off->val.tv_sec < -KTIME_SEC_MAX)
0399             goto out;
0400 
0401         tp = timespec64_add(tp, off->val);
0402         /*
0403          * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is
0404          * still unreachable.
0405          */
0406         if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
0407             goto out;
0408     }
0409 
0410     mutex_lock(&offset_lock);
0411     if (time_ns->frozen_offsets) {
0412         err = -EACCES;
0413         goto out_unlock;
0414     }
0415 
0416     err = 0;
0417     /* Don't report errors after this line */
0418     for (i = 0; i < noffsets; i++) {
0419         struct proc_timens_offset *off = &offsets[i];
0420         struct timespec64 *offset = NULL;
0421 
0422         switch (off->clockid) {
0423         case CLOCK_MONOTONIC:
0424             offset = &time_ns->offsets.monotonic;
0425             break;
0426         case CLOCK_BOOTTIME:
0427             offset = &time_ns->offsets.boottime;
0428             break;
0429         }
0430 
0431         *offset = off->val;
0432     }
0433 
0434 out_unlock:
0435     mutex_unlock(&offset_lock);
0436 out:
0437     put_time_ns(time_ns);
0438 
0439     return err;
0440 }
0441 
0442 const struct proc_ns_operations timens_operations = {
0443     .name       = "time",
0444     .type       = CLONE_NEWTIME,
0445     .get        = timens_get,
0446     .put        = timens_put,
0447     .install    = timens_install,
0448     .owner      = timens_owner,
0449 };
0450 
0451 const struct proc_ns_operations timens_for_children_operations = {
0452     .name       = "time_for_children",
0453     .real_ns_name   = "time",
0454     .type       = CLONE_NEWTIME,
0455     .get        = timens_for_children_get,
0456     .put        = timens_put,
0457     .install    = timens_install,
0458     .owner      = timens_owner,
0459 };
0460 
0461 struct time_namespace init_time_ns = {
0462     .ns.count   = REFCOUNT_INIT(3),
0463     .user_ns    = &init_user_ns,
0464     .ns.inum    = PROC_TIME_INIT_INO,
0465     .ns.ops     = &timens_operations,
0466     .frozen_offsets = true,
0467 };