lib/daemon-unix.c

   1 /*
   2  * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "daemon.h"
  19 #include "daemon-private.h"
  20 #include <errno.h>
  21 #include <fcntl.h>
  22 #include <grp.h>
  23 #include <pwd.h>
  24 #include <signal.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/resource.h>
  28 #include <sys/wait.h>
  29 #include <sys/stat.h>
  30 #include <unistd.h>
  31 #if HAVE_LIBCAPNG
  32 #include <cap-ng.h>
  33 #endif
  34 #include "command-line.h"
  35 #include "fatal-signal.h"
  36 #include "dirs.h"
  37 #include "lockfile.h"
  38 #include "ovs-thread.h"
  39 #include "process.h"
  40 #include "socket-util.h"
  41 #include "timeval.h"
  42 #include "util.h"
  43 #include "openvswitch/vlog.h"
  44
  45 VLOG_DEFINE_THIS_MODULE(daemon_unix);
  46
/* LINUX is 1 when building for Linux and 0 otherwise, so that the platform
 * can be tested with an ordinary C 'if' as well as with '#if'. */
#ifdef __linux__
#define LINUX 1
#else
#define LINUX 0
#endif

/* LIBCAPNG is 1 when HAVE_LIBCAPNG is nonzero at compile time and 0
 * otherwise. */
#if HAVE_LIBCAPNG
#define LIBCAPNG 1
#else
#define LIBCAPNG 0
#endif
  58
/* --detach: Should we run in the background? */
bool detach;                    /* Was --detach specified? */
static bool detached;           /* Set by the first daemonize_complete(). */

/* --pidfile: Name of pidfile (null if none).  Freed and reset to null by
 * daemonize_complete(). */
char *pidfile;

/* Device and inode of pidfile, so we can avoid reopening it.  Recorded by
 * make_pidfile() and consulted by read_pidfile__(). */
static dev_t pidfile_dev;
static ino_t pidfile_ino;

/* --overwrite-pidfile: Create pidfile even if one already exists and is
 * locked? */
static bool overwrite_pidfile;

/* --no-chdir: Should we chdir to "/"?  Cleared by set_no_chdir(). */
static bool chdir_ = true;

/* File descriptor used by daemonize_start() and daemonize_complete() to
 * notify the waiting parent that startup finished; -1 when there is no
 * parent to notify. */
static int daemonize_fd = -1;

/* --monitor: Should a supervisory process monitor the daemon and restart it if
 * it dies due to an error signal? */
static bool monitor;

/* --user: Only root can use this option. Switch to new uid:gid after
 * initially running as root.
 *
 * 'switch_user' is true while a user switch is still pending; it is cleared
 * once the switch has been performed.  'non_root_user' stays true after
 * daemon_set_new_user() so daemonize_start() can assert that the switch
 * happened.  'uid', 'gid' and 'user' describe the target identity. */
static bool switch_user = false;
static bool non_root_user = false;
static uid_t uid;
static gid_t gid;
static char *user = NULL;
static void daemon_become_new_user__(bool access_datapath);

/* Forward declarations for helpers defined later in this file. */
static void check_already_running(void);
static int lock_pidfile(FILE *, int command);
static pid_t fork_and_clean_up(void);
static void daemonize_post_detach(void);
  97
  98 /* Returns the file name that would be used for a pidfile if 'name' were
  99  * provided to set_pidfile().  The caller must free the returned string. */
 100 char *
 101 make_pidfile_name(const char *name)
 102 {
 103     return (!name
 104             ? xasprintf("%s/%s.pid", ovs_rundir(), program_name)
 105             : abs_file_name(ovs_rundir(), name));
 106 }
 107
/* Requests that daemonization not chdir to "/" (the --no-chdir option).
 * The flag is consulted by daemonize_post_detach(). */
void
set_no_chdir(void)
{
    chdir_ = false;
}
 114
/* Normally, daemonize() or daemonize_start() will terminate the program with
 * a message if a locked pidfile already exists.  If this function is called,
 * an existing pidfile will be replaced, with a warning. */
void
ignore_existing_pidfile(void)
{
    overwrite_pidfile = true;
}
 123
/* Sets up a following call to daemonize() or daemonize_start() to detach
 * from the foreground session, running this process in the background
 * (the --detach option). */
void
set_detach(void)
{
    detach = true;
}
 131
/* Sets up a following call to daemonize() or daemonize_start() to fork a
 * supervisory process that monitors the daemon and restarts it if it dies
 * due to an error signal (the --monitor option). */
void
daemon_set_monitor(void)
{
    monitor = true;
}
 139
 140 /* If a pidfile has been configured, creates it and stores the running
 141  * process's pid in it.  Ensures that the pidfile will be deleted when the
 142  * process exits. */
 143 static void
 144 make_pidfile(void)
 145 {
 146     long int pid = getpid();
 147     struct stat s;
 148     char *tmpfile;
 149     FILE *file;
 150     int error;
 151
 152     /* Create a temporary pidfile. */
 153     if (overwrite_pidfile) {
 154         tmpfile = xasprintf("%s.tmp%ld", pidfile, pid);
 155         fatal_signal_add_file_to_unlink(tmpfile);
 156     } else {
 157         /* Everyone shares the same file which will be treated as a lock.  To
 158          * avoid some uncomfortable race conditions, we can't set up the fatal
 159          * signal unlink until we've acquired it. */
 160         tmpfile = xasprintf("%s.tmp", pidfile);
 161     }
 162
 163     file = fopen(tmpfile, "a+");
 164     if (!file) {
 165         VLOG_FATAL("%s: create failed (%s)", tmpfile, ovs_strerror(errno));
 166     }
 167
 168     error = lock_pidfile(file, F_SETLK);
 169     if (error) {
 170         /* Looks like we failed to acquire the lock.  Note that, if we failed
 171          * for some other reason (and '!overwrite_pidfile'), we will have
 172          * left 'tmpfile' as garbage in the file system. */
 173         VLOG_FATAL("%s: fcntl(F_SETLK) failed (%s)", tmpfile,
 174                    ovs_strerror(error));
 175     }
 176
 177     if (!overwrite_pidfile) {
 178         /* We acquired the lock.  Make sure to clean up on exit, and verify
 179          * that we're allowed to create the actual pidfile. */
 180         fatal_signal_add_file_to_unlink(tmpfile);
 181         check_already_running();
 182     }
 183
 184     if (fstat(fileno(file), &s) == -1) {
 185         VLOG_FATAL("%s: fstat failed (%s)", tmpfile, ovs_strerror(errno));
 186     }
 187
 188     if (ftruncate(fileno(file), 0) == -1) {
 189         VLOG_FATAL("%s: truncate failed (%s)", tmpfile, ovs_strerror(errno));
 190     }
 191
 192     fprintf(file, "%ld\n", pid);
 193     if (fflush(file) == EOF) {
 194         VLOG_FATAL("%s: write failed (%s)", tmpfile, ovs_strerror(errno));
 195     }
 196
 197     error = rename(tmpfile, pidfile);
 198
 199     /* Due to a race, 'tmpfile' may be owned by a different process, so we
 200      * shouldn't delete it on exit. */
 201     fatal_signal_remove_file_to_unlink(tmpfile);
 202
 203     if (error < 0) {
 204         VLOG_FATAL("failed to rename \"%s\" to \"%s\" (%s)",
 205                    tmpfile, pidfile, ovs_strerror(errno));
 206     }
 207
 208     /* Ensure that the pidfile will get deleted on exit. */
 209     fatal_signal_add_file_to_unlink(pidfile);
 210
 211     /* Clean up.
 212      *
 213      * We don't close 'file' because its file descriptor must remain open to
 214      * hold the lock. */
 215     pidfile_dev = s.st_dev;
 216     pidfile_ino = s.st_ino;
 217     free(tmpfile);
 218 }
 219
 220 /* Calls fork() and on success returns its return value.  On failure, logs an
 221  * error and exits unsuccessfully.
 222  *
 223  * Post-fork, but before returning, this function calls a few other functions
 224  * that are generally useful if the child isn't planning to exec a new
 225  * process. */
 226 static pid_t
 227 fork_and_clean_up(void)
 228 {
 229     pid_t pid = xfork();
 230     if (pid > 0) {
 231         /* Running in parent process. */
 232         fatal_signal_fork();
 233     } else if (!pid) {
 234         /* Running in child process. */
 235         lockfile_postfork();
 236     }
 237     return pid;
 238 }
 239
 240 /* Forks, then:
 241  *
 242  *   - In the parent, waits for the child to signal that it has completed its
 243  *     startup sequence.  Then stores -1 in '*fdp' and returns the child's
 244  *     pid in '*child_pid' argument.
 245  *
 246  *   - In the child, stores a fd in '*fdp' and returns 0 through '*child_pid'
 247  *     argument.  The caller should pass the fd to fork_notify_startup() after
 248  *     it finishes its startup sequence.
 249  *
 250  * Returns 0 on success.  If something goes wrong and child process was not
 251  * able to signal its readiness by calling fork_notify_startup(), then this
 252  * function returns -1. However, even in case of failure it still sets child
 253  * process id in '*child_pid'. */
 254 static int
 255 fork_and_wait_for_startup(int *fdp, pid_t *child_pid)
 256 {
 257     int fds[2];
 258     pid_t pid;
 259     int ret = 0;
 260
 261     xpipe(fds);
 262
 263     pid = fork_and_clean_up();
 264     if (pid > 0) {
 265         /* Running in parent process. */
 266         size_t bytes_read;
 267         char c;
 268
 269         close(fds[1]);
 270         if (read_fully(fds[0], &c, 1, &bytes_read) != 0) {
 271             int retval;
 272             int status;
 273
 274             do {
 275                 retval = waitpid(pid, &status, 0);
 276             } while (retval == -1 && errno == EINTR);
 277
 278             if (retval == pid) {
 279                 if (WIFEXITED(status) && WEXITSTATUS(status)) {
 280                     /* Child exited with an error.  Convey the same error
 281                      * to our parent process as a courtesy. */
 282                     exit(WEXITSTATUS(status));
 283                 } else {
 284                     char *status_msg = process_status_msg(status);
 285                     VLOG_ERR("fork child died before signaling startup (%s)",
 286                              status_msg);
 287                     ret = -1;
 288                 }
 289             } else if (retval < 0) {
 290                 VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
 291             } else {
 292                 OVS_NOT_REACHED();
 293             }
 294         }
 295         close(fds[0]);
 296         *fdp = -1;
 297     } else if (!pid) {
 298         /* Running in child process. */
 299         close(fds[0]);
 300         *fdp = fds[1];
 301     }
 302     *child_pid = pid;
 303     return ret;
 304 }
 305
 306 static void
 307 fork_notify_startup(int fd)
 308 {
 309     if (fd != -1) {
 310         size_t bytes_written;
 311         int error;
 312
 313         error = write_fully(fd, "", 1, &bytes_written);
 314         if (error) {
 315             VLOG_FATAL("pipe write failed (%s)", ovs_strerror(error));
 316         }
 317
 318         close(fd);
 319     }
 320 }
 321
 322 static bool
 323 should_restart(int status)
 324 {
 325     if (WIFSIGNALED(status)) {
 326         static const int error_signals[] = {
 327             /* This list of signals is documented in daemon.man.  If you
 328              * change the list, update the documentation too. */
 329             SIGABRT, SIGALRM, SIGBUS, SIGFPE, SIGILL, SIGPIPE, SIGSEGV,
 330             SIGXCPU, SIGXFSZ
 331         };
 332
 333         size_t i;
 334
 335         for (i = 0; i < ARRAY_SIZE(error_signals); i++) {
 336             if (error_signals[i] == WTERMSIG(status)) {
 337                 return true;
 338             }
 339         }
 340     }
 341     return false;
 342 }
 343
 344 static void
 345 monitor_daemon(pid_t daemon_pid)
 346 {
 347     /* XXX Should log daemon's stderr output at startup time. */
 348     time_t last_restart;
 349     char *status_msg;
 350     int crashes;
 351     bool child_ready = true;
 352
 353     set_subprogram_name("monitor");
 354     status_msg = xstrdup("healthy");
 355     last_restart = TIME_MIN;
 356     crashes = 0;
 357     for (;;) {
 358         int retval;
 359         int status;
 360
 361         ovs_cmdl_proctitle_set("monitoring pid %lu (%s)",
 362                                (unsigned long int) daemon_pid, status_msg);
 363
 364         if (child_ready) {
 365             do {
 366                 retval = waitpid(daemon_pid, &status, 0);
 367             } while (retval == -1 && errno == EINTR);
 368             if (retval == -1) {
 369                 VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
 370             }
 371         }
 372
 373         if (!child_ready || retval == daemon_pid) {
 374             char *s = process_status_msg(status);
 375             if (should_restart(status)) {
 376                 free(status_msg);
 377                 status_msg = xasprintf("%d crashes: pid %lu died, %s",
 378                                        ++crashes,
 379                                        (unsigned long int) daemon_pid, s);
 380                 free(s);
 381
 382                 if (WCOREDUMP(status)) {
 383                     /* Disable further core dumps to save disk space. */
 384                     struct rlimit r;
 385
 386                     r.rlim_cur = 0;
 387                     r.rlim_max = 0;
 388                     if (setrlimit(RLIMIT_CORE, &r) == -1) {
 389                         VLOG_WARN("failed to disable core dumps: %s",
 390                                   ovs_strerror(errno));
 391                     }
 392                 }
 393
 394                 /* Throttle restarts to no more than once every 10 seconds. */
 395                 if (time(NULL) < last_restart + 10) {
 396                     VLOG_WARN("%s, waiting until 10 seconds since last "
 397                               "restart", status_msg);
 398                     for (;;) {
 399                         time_t now = time(NULL);
 400                         time_t wakeup = last_restart + 10;
 401                         if (now >= wakeup) {
 402                             break;
 403                         }
 404                         xsleep(wakeup - now);
 405                     }
 406                 }
 407                 last_restart = time(NULL);
 408
 409                 VLOG_ERR("%s, restarting", status_msg);
 410                 child_ready = !fork_and_wait_for_startup(&daemonize_fd,
 411                                                          &daemon_pid);
 412                 if (child_ready && !daemon_pid) {
 413                     /* Child process needs to break out of monitoring
 414                      * loop. */
 415                     break;
 416                 }
 417             } else {
 418                 VLOG_INFO("pid %lu died, %s, exiting",
 419                           (unsigned long int) daemon_pid, s);
 420                 free(s);
 421                 exit(0);
 422             }
 423         }
 424     }
 425     free(status_msg);
 426
 427     /* Running in new daemon process. */
 428     ovs_cmdl_proctitle_restore();
 429     set_subprogram_name("");
 430 }
 431
/* If daemonization is configured, then starts daemonization, by forking and
 * returning in the child process.  The parent process hangs around until the
 * child lets it know either that it completed startup successfully (by calling
 * daemonize_complete()) or that it failed to start up (by exiting with a
 * nonzero exit code).
 *
 * Steps, in order: perform any pending --user switch (passing
 * 'access_datapath' through to it), detach if --detach was given, fork a
 * monitor if --monitor was given, forbid further forking, create the pidfile
 * if one is configured, and initialize vlog.
 *
 * Must be called while the process is single-threaded. */
void
daemonize_start(bool access_datapath)
{
    assert_single_threaded();
    daemonize_fd = -1;

    if (switch_user) {
        daemon_become_new_user__(access_datapath);
        switch_user = false;
    }

    /* If --user is specified, make sure user switch has completed by now.  */
    if (non_root_user) {
        ovs_assert(geteuid() && getuid());
    }

    if (detach) {
        pid_t pid;

        if (fork_and_wait_for_startup(&daemonize_fd, &pid)) {
            VLOG_FATAL("could not detach from foreground session");
        }
        if (pid > 0) {
            /* Running in parent process.  The child has signaled that its
             * startup completed, so the foreground process is done. */
            exit(0);
        }

        /* Running in daemon or monitor process. */
        setsid();
    }

    if (monitor) {
        /* Keep the fd (if any) that leads back to the foreground parent:
         * the call below overwrites 'daemonize_fd'. */
        int saved_daemonize_fd = daemonize_fd;
        pid_t daemon_pid;

        if (fork_and_wait_for_startup(&daemonize_fd, &daemon_pid)) {
            VLOG_FATAL("could not initiate process monitoring");
        }
        if (daemon_pid > 0) {
            /* Running in monitor process.  The daemon has started, so pass
             * the notification on to the foreground parent, then supervise.
             * monitor_daemon() returns only in a restarted daemon child. */
            fork_notify_startup(saved_daemonize_fd);
            close_standard_fds();
            monitor_daemon(daemon_pid);
        }
        /* Running in daemon process. */
    }

    forbid_forking("running in daemon process");

    if (pidfile) {
        make_pidfile();
    }

    /* Make sure that the unixctl commands for vlog get registered in a
     * daemon, even before the first log message. */
    vlog_init();
}
 494
 495 /* If daemonization is configured, then this function notifies the parent
 496  * process that the child process has completed startup successfully.  It also
 497  * call daemonize_post_detach().
 498  *
 499  * Calling this function more than once has no additional effect. */
 500 void
 501 daemonize_complete(void)
 502 {
 503     if (pidfile) {
 504         free(pidfile);
 505         pidfile = NULL;
 506     }
 507
 508     if (!detached) {
 509         detached = true;
 510
 511         fork_notify_startup(daemonize_fd);
 512         daemonize_fd = -1;
 513         daemonize_post_detach();
 514     }
 515 }
 516
 517 /* If daemonization is configured, then this function does traditional Unix
 518  * daemonization behavior: join a new session, chdir to the root (if not
 519  * disabled), and close the standard file descriptors.
 520  *
 521  * It only makes sense to call this function as part of an implementation of a
 522  * special daemon subprocess.  A normal daemon should just call
 523  * daemonize_complete(). */
 524 static void
 525 daemonize_post_detach(void)
 526 {
 527     if (detach) {
 528         if (chdir_) {
 529             ignore(chdir("/"));
 530         }
 531         close_standard_fds();
 532     }
 533 }
 534
 535 void
 536 daemon_usage(void)
 537 {
 538     printf(
 539         "\nDaemon options:\n"
 540         "  --detach                run in background as daemon\n"
 541         "  --no-chdir              do not chdir to '/'\n"
 542         "  --pidfile[=FILE]        create pidfile (default: %s/%s.pid)\n"
 543         "  --overwrite-pidfile     with --pidfile, start even if already "
 544                                    "running\n",
 545         ovs_rundir(), program_name);
 546 }
 547
 548 static int
 549 lock_pidfile__(FILE *file, int command, struct flock *lck)
 550 {
 551     int error;
 552
 553     lck->l_type = F_WRLCK;
 554     lck->l_whence = SEEK_SET;
 555     lck->l_start = 0;
 556     lck->l_len = 0;
 557     lck->l_pid = 0;
 558
 559     do {
 560         error = fcntl(fileno(file), command, lck) == -1 ? errno : 0;
 561     } while (error == EINTR);
 562     return error;
 563 }
 564
 565 static int
 566 lock_pidfile(FILE *file, int command)
 567 {
 568     struct flock lck;
 569
 570     return lock_pidfile__(file, command, &lck);
 571 }
 572
/* Reads the pid stored in 'pidfile' and validates it against the process
 * that currently holds the write lock on the file.
 *
 * Returns:
 *
 *   - A positive pid if 'pidfile' is locked by the process whose pid it
 *     contains, or getpid() if 'pidfile' is this process's own pidfile.
 *
 *   - 0, only if 'delete_if_stale' is true, when 'pidfile' does not exist or
 *     when it was stale (not locked by anyone) and has been deleted.
 *
 *   - A negative errno value otherwise, including -ESRCH for a stale pidfile
 *     when 'delete_if_stale' is false. */
static pid_t
read_pidfile__(const char *pidfile, bool delete_if_stale)
{
    struct stat s, s2;
    struct flock lck;
    char line[128];
    FILE *file;
    int error;

    if ((pidfile_ino || pidfile_dev)
        && !stat(pidfile, &s)
        && s.st_ino == pidfile_ino && s.st_dev == pidfile_dev) {
        /* It's our own pidfile.  We can't afford to open it, because closing
         * *any* fd for a file that a process has locked also releases all the
         * locks on that file.
         *
         * Fortunately, we know the associated pid anyhow: */
        return getpid();
    }

    /* Opened for update ("r+") because the stale-file path below needs to
     * take a write lock on it. */
    file = fopen(pidfile, "r+");
    if (!file) {
        if (errno == ENOENT && delete_if_stale) {
            /* Nothing to delete: a missing pidfile is as good as a deleted
             * one. */
            return 0;
        }
        error = errno;
        VLOG_WARN("%s: open: %s", pidfile, ovs_strerror(error));
        goto error;
    }

    /* Ask who, if anyone, holds a lock that would conflict with ours. */
    error = lock_pidfile__(file, F_GETLK, &lck);
    if (error) {
        VLOG_WARN("%s: fcntl: %s", pidfile, ovs_strerror(error));
        goto error;
    }
    if (lck.l_type == F_UNLCK) {
        /* pidfile exists but it isn't locked by anyone.  We need to delete it
         * so that a new pidfile can go in its place.  But just calling
         * unlink(pidfile) makes a nasty race: what if someone else unlinks it
         * before we do and then replaces it by a valid pidfile?  We'd unlink
         * their valid pidfile.  We do a little dance to avoid the race, by
         * locking the invalid pidfile.  Only one process can have the invalid
         * pidfile locked, and only that process has the right to unlink it. */
        if (!delete_if_stale) {
            error = ESRCH;
            VLOG_DBG("%s: pid file is stale", pidfile);
            goto error;
        }

        /* Get the lock. */
        error = lock_pidfile(file, F_SETLK);
        if (error) {
            /* We lost a race with someone else doing the same thing. */
            VLOG_WARN("%s: lost race to lock pidfile", pidfile);
            goto error;
        }

        /* Is the file we have locked still named 'pidfile'? */
        if (stat(pidfile, &s) || fstat(fileno(file), &s2)
            || s.st_ino != s2.st_ino || s.st_dev != s2.st_dev) {
            /* No.  We lost a race with someone else who got the lock before
             * us, deleted the pidfile, and closed it (releasing the lock). */
            error = EALREADY;
            VLOG_WARN("%s: lost race to delete pidfile", pidfile);
            goto error;
        }

        /* We won the right to delete the stale pidfile. */
        if (unlink(pidfile)) {
            error = errno;
            VLOG_WARN("%s: failed to delete stale pidfile (%s)",
                      pidfile, ovs_strerror(error));
            goto error;
        }
        VLOG_DBG("%s: deleted stale pidfile", pidfile);
        fclose(file);
        return 0;
    }

    /* The file is locked by another process: read the pid it advertises. */
    if (!fgets(line, sizeof line, file)) {
        if (ferror(file)) {
            error = errno;
            VLOG_WARN("%s: read: %s", pidfile, ovs_strerror(error));
        } else {
            error = ESRCH;
            VLOG_WARN("%s: read: unexpected end of file", pidfile);
        }
        goto error;
    }

    if (lck.l_pid != strtoul(line, NULL, 10)) {
        /* The process that has the pidfile locked is not the process that
         * created it.  It must be stale, with the process that has it locked
         * preparing to delete it. */
        error = ESRCH;
        VLOG_WARN("%s: stale pidfile for pid %s being deleted by pid %ld",
                  pidfile, line, (long int) lck.l_pid);
        goto error;
    }

    fclose(file);
    return lck.l_pid;

error:
    /* 'file' is null only when fopen() itself failed. */
    if (file) {
        fclose(file);
    }
    return -error;
}
 682
 683 /* Opens and reads a PID from 'pidfile'.  Returns the positive PID if
 684  * successful, otherwise a negative errno value. */
 685 pid_t
 686 read_pidfile(const char *pidfile)
 687 {
 688     return read_pidfile__(pidfile, false);
 689 }
 690
 691 /* Checks whether a process with the given 'pidfile' is already running and,
 692  * if so, aborts.  If 'pidfile' is stale, deletes it. */
 693 static void
 694 check_already_running(void)
 695 {
 696     long int pid = read_pidfile__(pidfile, true);
 697     if (pid > 0) {
 698         VLOG_FATAL("%s: already running as pid %ld, aborting", pidfile, pid);
 699     } else if (pid < 0) {
 700         VLOG_FATAL("%s: pidfile check failed (%s), aborting",
 701                    pidfile, ovs_strerror(-pid));
 702     }
 703 }
 704
 705 \f
 706 /* stub functions for non-windows platform. */
 707
/* Does nothing on this platform; both arguments are ignored. */
void
service_start(int *argc OVS_UNUSED, char **argv[] OVS_UNUSED)
{
}
 712
/* Does nothing on this platform. */
void
service_stop(void)
{
}
 717
/* Always returns false on this platform. */
bool
should_service_stop(void)
{
    return false;
}
 723
 724 \f
 725 static bool
 726 gid_matches(gid_t expected, gid_t value)
 727 {
 728     return expected == -1 || expected == value;
 729 }
 730
 731 static bool
 732 gid_verify(gid_t real, gid_t effective, gid_t saved)
 733 {
 734     gid_t r, e, s;
 735
 736     return (getresgid(&r, &e, &s) == 0 &&
 737             gid_matches(real, r) &&
 738             gid_matches(effective, e) &&
 739             gid_matches(saved, s));
 740 }
 741
 742 static void
 743 daemon_switch_group(gid_t real, gid_t effective,
 744                     gid_t saved)
 745 {
 746     if ((setresgid(real, effective, saved) == -1) ||
 747         !gid_verify(real, effective, saved)) {
 748         VLOG_FATAL("%s: fail to switch group to gid as %d, aborting",
 749                    pidfile, gid);
 750     }
 751 }
 752
 753 static bool
 754 uid_matches(uid_t expected, uid_t value)
 755 {
 756     return expected == -1 || expected == value;
 757 }
 758
 759 static bool
 760 uid_verify(const uid_t real, const uid_t effective, const uid_t saved)
 761 {
 762     uid_t r, e, s;
 763
 764     return (getresuid(&r, &e, &s) == 0 &&
 765             uid_matches(real, r) &&
 766             uid_matches(effective, e) &&
 767             uid_matches(saved, s));
 768 }
 769
 770 static void
 771 daemon_switch_user(const uid_t real, const uid_t effective, const uid_t saved,
 772                    const char *user)
 773 {
 774     if ((setresuid(real, effective, saved) == -1) ||
 775         !uid_verify(real, effective, saved)) {
 776         VLOG_FATAL("%s: fail to switch user to %s, aborting",
 777                    pidfile, user);
 778     }
 779 }
 780
 781 /* Use portable Unix APIs to switch uid:gid, when datapath
 782  * access is not required.  On Linux systems, all capabilities
 783  * will be dropped.  */
 784 static void
 785 daemon_become_new_user_unix(void)
 786 {
 787     /* "Setuid Demystified" by Hao Chen, etc outlines some caveats of
 788      * around unix system call setuid() and friends. This implementation
 789      * mostly follow the advice given by the paper.  The paper is
 790      * published in 2002, so things could have changed.  */
 791
 792     /* Change both real and effective uid and gid will permanently
 793      * drop the process' privilege.  "Setuid Demystified" suggested
 794      * that calling getuid() after each setuid() call to verify they
 795      * are actually set, because checking return code alone is not
 796      * sufficient.  */
 797     daemon_switch_group(gid, gid, gid);
 798     if (user && initgroups(user, gid) == -1) {
 799         VLOG_FATAL("%s: fail to add supplementary group gid %d, "
 800                    "aborting", pidfile, gid);
 801     }
 802     daemon_switch_user(uid, uid, uid, user);
 803 }
 804
/* Linux specific implementation of daemon_become_new_user()
 * using libcap-ng.
 *
 * Replaces the process's capabilities with CAP_IPC_LOCK and
 * CAP_NET_BIND_SERVICE, plus CAP_NET_ADMIN and CAP_NET_RAW when
 * 'access_datapath' is true, and then switches to 'uid':'gid' while keeping
 * those capabilities.  Any failure is fatal.
 *
 * Only compiled on Linux when libcap-ng is available. */
#if defined __linux__ &&  HAVE_LIBCAPNG
static void
daemon_become_new_user_linux(bool access_datapath)
{
    int ret;

    ret = capng_get_caps_process();

    if (!ret) {
        /* Only proceed if the process currently holds some capability. */
        if (capng_have_capabilities(CAPNG_SELECT_CAPS) > CAPNG_NONE) {
            const capng_type_t cap_sets = CAPNG_EFFECTIVE|CAPNG_PERMITTED;

            capng_clear(CAPNG_SELECT_BOTH);

            /* '||' stops at the first capng_update() that fails. */
            ret = capng_update(CAPNG_ADD, cap_sets, CAP_IPC_LOCK)
                  || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BIND_SERVICE);

            if (access_datapath && !ret) {
                ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
                      || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW);
            }
        } else {
            ret = -1;
        }
    }

    if (!ret) {
        /* CAPNG_INIT_SUPP_GRP will be a better choice than
         * CAPNG_DROP_SUPP_GRP. However this enum value is only defined
         * with libcap-ng higher than version 0.7.4, which is not widely
         * available on many Linux distributions yet. Taking a more
         * conservative approach to make sure OVS behaves consistently.
         *
         * XXX We may change this for future OVS releases.
         */
        ret = capng_change_id(uid, gid, CAPNG_DROP_SUPP_GRP
                              | CAPNG_CLEAR_BOUNDING);
    }

    if (ret) {
        VLOG_FATAL("%s: libcap-ng fail to switch to user and group "
                   "%d:%d, aborting", pidfile, uid, gid);
    }
}
#endif
 852
 853 static void
 854 daemon_become_new_user__(bool access_datapath)
 855 {
 856     if (LINUX) {
 857         if (LIBCAPNG) {
 858             daemon_become_new_user_linux(access_datapath);
 859         } else {
 860             VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
 861                        "(libcap-ng is not configured at compile time), "
 862                        "aborting.", pidfile);
 863         }
 864     } else {
 865         daemon_become_new_user_unix();
 866     }
 867 }
 868
 869 /* Noramlly, user switch is embedded within daemonize_start().
 870  * However, there in case the user switch needs to be done
 871  * before daemonize_start(), the following API can be used.  */
 872 void
 873 daemon_become_new_user(bool access_datapath)
 874 {
 875     assert_single_threaded();
 876     if (switch_user) {
 877         daemon_become_new_user__(access_datapath);
 878
 879         /* Make sure daemonize_start() will not switch
 880          * user again. */
 881         switch_user = false;
 882     }
 883 }
 884
 885 /* Return the maximun suggested buffer size for both getpwname_r()
 886  * and getgrnam_r().
 887  *
 888  * This size may still not be big enough. in case getpwname_r()
 889  * and friends return ERANGE, a larger buffer should be supplied to
 890  * retry. (The man page did not specify the max size to stop at, we
 891  * will keep trying with doubling the buffer size for each round until
 892  * the size wrapps around size_t.  */
 893 static size_t
 894 get_sysconf_buffer_size(void)
 895 {
 896     size_t bufsize, pwd_bs = 0, grp_bs = 0;
 897     const size_t default_bufsize = 1024;
 898
 899     errno = 0;
 900     if ((pwd_bs = sysconf(_SC_GETPW_R_SIZE_MAX)) == -1) {
 901         if (errno) {
 902             VLOG_FATAL("%s: Read initial passwordd struct size "
 903                        "failed (%s), aborting. ", pidfile,
 904                        ovs_strerror(errno));
 905         }
 906     }
 907
 908     if ((grp_bs = sysconf(_SC_GETGR_R_SIZE_MAX)) == -1) {
 909         if (errno) {
 910             VLOG_FATAL("%s: Read initial group struct size "
 911                        "failed (%s), aborting. ", pidfile,
 912                        ovs_strerror(errno));
 913         }
 914     }
 915
 916     bufsize = MAX(pwd_bs, grp_bs);
 917     return bufsize ? bufsize : default_bufsize;
 918 }
 919
 920 /* Try to double the size of '*buf', return true
 921  * if successful, and '*sizep' will be updated with
 922  * the new size. Otherwise, return false.  */
 923 static bool
 924 enlarge_buffer(char **buf, size_t *sizep)
 925 {
 926     size_t newsize = *sizep * 2;
 927
 928     if (newsize > *sizep) {
 929         *buf = xrealloc(*buf, newsize);
 930         *sizep = newsize;
 931         return true;
 932     }
 933
 934     return false;
 935 }
 936
 937 /* Parse and sanity check user_spec.
 938  *
 939  * If successful, set global variables 'uid' and 'gid'
 940  * with the parsed results. Global variable 'user'
 941  * will be pointing to a string that stores the name
 942  * of the user to be switched into.
 943  *
 944  * Also set 'switch_to_new_user' to true, The actual
 945  * user switching is done as soon as daemonize_start()
 946  * is called. I/O access before calling daemonize_start()
 947  * will still be with root's credential.  */
 948 void
 949 daemon_set_new_user(const char *user_spec)
 950 {
 951     char *pos = strchr(user_spec, ':');
 952     size_t init_bufsize, bufsize;
 953
 954     init_bufsize = get_sysconf_buffer_size();
 955     uid = getuid();
 956     gid = getgid();
 957
 958     if (geteuid() || uid) {
 959         VLOG_FATAL("%s: only root can use --user option", pidfile);
 960     }
 961
 962     user_spec += strspn(user_spec, " \t\r\n");
 963     size_t len = pos ? pos - user_spec : strlen(user_spec);
 964     char *buf;
 965     struct passwd pwd, *res;
 966     int e;
 967
 968     bufsize = init_bufsize;
 969     buf = xmalloc(bufsize);
 970     if (len) {
 971         user = xmemdup0(user_spec, len);
 972
 973         while ((e = getpwnam_r(user, &pwd, buf, bufsize, &res)) == ERANGE) {
 974             if (!enlarge_buffer(&buf, &bufsize)) {
 975                 break;
 976             }
 977         }
 978
 979         if (e != 0) {
 980             VLOG_FATAL("%s: Failed to retrive user %s's uid (%s), aborting.",
 981                        pidfile, user, ovs_strerror(e));
 982         }
 983     } else {
 984         /* User name is not specified, use current user.  */
 985         while ((e = getpwuid_r(uid, &pwd, buf, bufsize, &res)) == ERANGE) {
 986             if (!enlarge_buffer(&buf, &bufsize)) {
 987                 break;
 988             }
 989         }
 990
 991         if (e != 0) {
 992             VLOG_FATAL("%s: Failed to retrive current user's name "
 993                        "(%s), aborting.", pidfile, ovs_strerror(e));
 994         }
 995         user = xstrdup(pwd.pw_name);
 996     }
 997
 998     uid = pwd.pw_uid;
 999     gid = pwd.pw_gid;
1000     free(buf);
1001
1002     if (pos) {
1003         char *grpstr = pos + 1;
1004         grpstr += strspn(grpstr, " \t\r\n");
1005
1006         if (*grpstr) {
1007             struct group grp, *res;
1008
1009             bufsize = init_bufsize;
1010             buf = xmalloc(bufsize);
1011             while ((e = getgrnam_r(grpstr, &grp, buf, bufsize, &res))
1012                          == ERANGE) {
1013                 if (!enlarge_buffer(&buf, &bufsize)) {
1014                     break;
1015                 }
1016             }
1017
1018             if (e) {
1019                 VLOG_FATAL("%s: Failed to get group entry for %s, "
1020                            "(%s), aborting.", pidfile, grpstr,
1021                            ovs_strerror(e));
1022             }
1023
1024             if (gid != grp.gr_gid) {
1025                 char **mem;
1026
1027                 for (mem = grp.gr_mem; *mem; ++mem) {
1028                     if (!strcmp(*mem, user)) {
1029                         break;
1030                     }
1031                 }
1032
1033                 if (!*mem) {
1034                     VLOG_FATAL("%s: Invalid --user option %s (user %s is "
1035                                "not in group %s), aborting.", pidfile,
1036                                user_spec, user, grpstr);
1037                 }
1038                 gid = grp.gr_gid;
1039             }
1040             free(buf);
1041         }
1042     }
1043
1044     switch_user = non_root_user = true;
1045 }