lib/daemon-unix.c

   1 /*
   2  * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2015 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18 #include "daemon.h"
  19 #include "daemon-private.h"
  20 #include <errno.h>
  21 #include <fcntl.h>
  22 #include <grp.h>
  23 #include <pwd.h>
  24 #include <signal.h>
  25 #include <stdlib.h>
  26 #include <string.h>
  27 #include <sys/resource.h>
  28 #include <sys/wait.h>
  29 #include <sys/stat.h>
  30 #include <unistd.h>
  31 #if HAVE_LIBCAPNG
  32 #include <cap-ng.h>
  33 #endif
  34 #include "command-line.h"
  35 #include "fatal-signal.h"
  36 #include "dirs.h"
  37 #include "lockfile.h"
  38 #include "ovs-thread.h"
  39 #include "process.h"
  40 #include "socket-util.h"
  41 #include "timeval.h"
  42 #include "util.h"
  43 #include "openvswitch/vlog.h"
  44
  45 VLOG_DEFINE_THIS_MODULE(daemon_unix);
  46
     /* LINUX is 1 when compiling for Linux (__linux__ defined) and 0 otherwise.
      * daemon_become_new_user__() tests it with an ordinary 'if' rather than
      * with the preprocessor. */
  47 #ifdef __linux__
  48 #define LINUX 1
  49 #else
  50 #define LINUX 0
  51 #endif
  52
     /* LIBCAPNG is 1 when HAVE_LIBCAPNG is nonzero and 0 otherwise.  Like LINUX,
      * it is tested with an ordinary 'if' in daemon_become_new_user__(). */
  53 #if HAVE_LIBCAPNG
  54 #define LIBCAPNG 1
  55 #else
  56 #define LIBCAPNG 0
  57 #endif
  58
  59 /* --detach: Should we run in the background? */
  60 bool detach;                    /* Was --detach specified? */
  61 static bool detached;           /* Set by daemonize_complete(). */
  62
  63 /* --pidfile: Name of pidfile (null if none).  daemonize_complete() frees
      * the string and resets this to null. */
  64 char *pidfile;
  65
  66 /* Device and inode of pidfile, recorded by make_pidfile() so that
      * read_pidfile__() can recognize our own pidfile and avoid reopening it. */
  67 static dev_t pidfile_dev;
  68 static ino_t pidfile_ino;
  69
  70 /* --overwrite-pidfile: Create pidfile even if one already exists and is
  71    locked?  Set by ignore_existing_pidfile(). */
  72 static bool overwrite_pidfile;
  73
  74 /* --no-chdir: Should we chdir to "/"?  Cleared by set_no_chdir(). */
  75 static bool chdir_ = true;
  76
  77 /* File descriptor used by daemonize_start() and daemonize_complete(): the
      * fd that fork_and_wait_for_startup() hands to a forked child, which is
      * later passed to fork_notify_startup().  -1 when there is none. */
  78 static int daemonize_fd = -1;
  79
  80 /* --monitor: Should a supervisory process monitor the daemon and restart it if
  81  * it dies due to an error signal?  Set by daemon_set_monitor(). */
  82 static bool monitor;
  83
  84 /* --user: Only root can use this option. Switch to new uid:gid after
  85  * initially running as root.  All four variables below are filled in by
      * daemon_set_new_user(). */
  86 static bool switch_user = false;  /* Is a user switch still pending? */
  87 static uid_t uid;                 /* Target user id. */
  88 static gid_t gid;                 /* Target group id. */
  89 static char *user = NULL;         /* Target user name. */
  90 static void daemon_become_new_user__(bool access_datapath);
  91
     /* Forward declarations of helpers defined later in this file. */
  92 static void check_already_running(void);
  93 static int lock_pidfile(FILE *, int command);
  94 static pid_t fork_and_clean_up(void);
  95 static void daemonize_post_detach(void);
  96
  97 /* Returns the file name that would be used for a pidfile if 'name' were
  98  * provided to set_pidfile().  The caller must free the returned string. */
  99 char *
 100 make_pidfile_name(const char *name)
 101 {
 102     return (!name
 103             ? xasprintf("%s/%s.pid", ovs_rundir(), program_name)
 104             : abs_file_name(ovs_rundir(), name));
 105 }
 106
 107 /* Sets that we do not chdir to "/". */
 108 void
 109 set_no_chdir(void)
 110 {
 111     chdir_ = false;
 112 }
 113
 114 /* Normally, daemonize() or damonize_start() will terminate the program with a
 115  * message if a locked pidfile already exists.  If this function is called, an
 116  * existing pidfile will be replaced, with a warning. */
 117 void
 118 ignore_existing_pidfile(void)
 119 {
 120     overwrite_pidfile = true;
 121 }
 122
 123 /* Sets up a following call to daemonize() to detach from the foreground
 124  * session, running this process in the background.  */
 125 void
 126 set_detach(void)
 127 {
 128     detach = true;
 129 }
 130
 131 /* Sets up a following call to daemonize() to fork a supervisory process to
 132  * monitor the daemon and restart it if it dies due to an error signal.  */
 133 void
 134 daemon_set_monitor(void)
 135 {
 136     monitor = true;
 137 }
 138
 139 /* If a pidfile has been configured, creates it and stores the running
 140  * process's pid in it.  Ensures that the pidfile will be deleted when the
 141  * process exits. */
 142 static void
 143 make_pidfile(void)
 144 {
 145     long int pid = getpid();
 146     struct stat s;
 147     char *tmpfile;
 148     FILE *file;
 149     int error;
 150
 151     /* Create a temporary pidfile. */
 152     if (overwrite_pidfile) {
 153         tmpfile = xasprintf("%s.tmp%ld", pidfile, pid);
 154         fatal_signal_add_file_to_unlink(tmpfile);
 155     } else {
 156         /* Everyone shares the same file which will be treated as a lock.  To
 157          * avoid some uncomfortable race conditions, we can't set up the fatal
 158          * signal unlink until we've acquired it. */
 159         tmpfile = xasprintf("%s.tmp", pidfile);
 160     }
 161
 162     file = fopen(tmpfile, "a+");
 163     if (!file) {
 164         VLOG_FATAL("%s: create failed (%s)", tmpfile, ovs_strerror(errno));
 165     }
 166
 167     error = lock_pidfile(file, F_SETLK);
 168     if (error) {
 169         /* Looks like we failed to acquire the lock.  Note that, if we failed
 170          * for some other reason (and '!overwrite_pidfile'), we will have
 171          * left 'tmpfile' as garbage in the file system. */
 172         VLOG_FATAL("%s: fcntl(F_SETLK) failed (%s)", tmpfile,
 173                    ovs_strerror(error));
 174     }
 175
 176     if (!overwrite_pidfile) {
 177         /* We acquired the lock.  Make sure to clean up on exit, and verify
 178          * that we're allowed to create the actual pidfile. */
 179         fatal_signal_add_file_to_unlink(tmpfile);
 180         check_already_running();
 181     }
 182
 183     if (fstat(fileno(file), &s) == -1) {
 184         VLOG_FATAL("%s: fstat failed (%s)", tmpfile, ovs_strerror(errno));
 185     }
 186
 187     if (ftruncate(fileno(file), 0) == -1) {
 188         VLOG_FATAL("%s: truncate failed (%s)", tmpfile, ovs_strerror(errno));
 189     }
 190
 191     fprintf(file, "%ld\n", pid);
 192     if (fflush(file) == EOF) {
 193         VLOG_FATAL("%s: write failed (%s)", tmpfile, ovs_strerror(errno));
 194     }
 195
 196     error = rename(tmpfile, pidfile);
 197
 198     /* Due to a race, 'tmpfile' may be owned by a different process, so we
 199      * shouldn't delete it on exit. */
 200     fatal_signal_remove_file_to_unlink(tmpfile);
 201
 202     if (error < 0) {
 203         VLOG_FATAL("failed to rename \"%s\" to \"%s\" (%s)",
 204                    tmpfile, pidfile, ovs_strerror(errno));
 205     }
 206
 207     /* Ensure that the pidfile will get deleted on exit. */
 208     fatal_signal_add_file_to_unlink(pidfile);
 209
 210     /* Clean up.
 211      *
 212      * We don't close 'file' because its file descriptor must remain open to
 213      * hold the lock. */
 214     pidfile_dev = s.st_dev;
 215     pidfile_ino = s.st_ino;
 216     free(tmpfile);
 217 }
 218
 219 /* Calls fork() and on success returns its return value.  On failure, logs an
 220  * error and exits unsuccessfully.
 221  *
 222  * Post-fork, but before returning, this function calls a few other functions
 223  * that are generally useful if the child isn't planning to exec a new
 224  * process. */
 225 static pid_t
 226 fork_and_clean_up(void)
 227 {
 228     pid_t pid = xfork();
 229     if (pid > 0) {
 230         /* Running in parent process. */
 231         fatal_signal_fork();
 232     } else if (!pid) {
 233         /* Running in child process. */
 234         lockfile_postfork();
 235     }
 236     return pid;
 237 }
 238
 239 /* Forks, then:
 240  *
 241  *   - In the parent, waits for the child to signal that it has completed its
 242  *     startup sequence.  Then stores -1 in '*fdp' and returns the child's
 243  *     pid in '*child_pid' argument.
 244  *
 245  *   - In the child, stores a fd in '*fdp' and returns 0 through '*child_pid'
 246  *     argument.  The caller should pass the fd to fork_notify_startup() after
 247  *     it finishes its startup sequence.
 248  *
 249  * Returns 0 on success.  If something goes wrong and child process was not
 250  * able to signal its readiness by calling fork_notify_startup(), then this
 251  * function returns -1. However, even in case of failure it still sets child
 252  * process id in '*child_pid'. */
 253 static int
 254 fork_and_wait_for_startup(int *fdp, pid_t *child_pid)
 255 {
 256     int fds[2];
 257     pid_t pid;
 258     int ret = 0;
 259
 260     xpipe(fds);
 261
 262     pid = fork_and_clean_up();
 263     if (pid > 0) {
 264         /* Running in parent process. */
 265         size_t bytes_read;
 266         char c;
 267
 268         close(fds[1]);
 269         if (read_fully(fds[0], &c, 1, &bytes_read) != 0) {
 270             int retval;
 271             int status;
 272
 273             do {
 274                 retval = waitpid(pid, &status, 0);
 275             } while (retval == -1 && errno == EINTR);
 276
 277             if (retval == pid) {
 278                 if (WIFEXITED(status) && WEXITSTATUS(status)) {
 279                     /* Child exited with an error.  Convey the same error
 280                      * to our parent process as a courtesy. */
 281                     exit(WEXITSTATUS(status));
 282                 } else {
 283                     char *status_msg = process_status_msg(status);
 284                     VLOG_ERR("fork child died before signaling startup (%s)",
 285                              status_msg);
 286                     ret = -1;
 287                 }
 288             } else if (retval < 0) {
 289                 VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
 290             } else {
 291                 OVS_NOT_REACHED();
 292             }
 293         }
 294         close(fds[0]);
 295         *fdp = -1;
 296     } else if (!pid) {
 297         /* Running in child process. */
 298         close(fds[0]);
 299         *fdp = fds[1];
 300     }
 301     *child_pid = pid;
 302     return ret;
 303 }
 304
 305 static void
 306 fork_notify_startup(int fd)
 307 {
 308     if (fd != -1) {
 309         size_t bytes_written;
 310         int error;
 311
 312         error = write_fully(fd, "", 1, &bytes_written);
 313         if (error) {
 314             VLOG_FATAL("pipe write failed (%s)", ovs_strerror(error));
 315         }
 316
 317         close(fd);
 318     }
 319 }
 320
 321 static bool
 322 should_restart(int status)
 323 {
 324     if (WIFSIGNALED(status)) {
 325         static const int error_signals[] = {
 326             /* This list of signals is documented in daemon.man.  If you
 327              * change the list, update the documentation too. */
 328             SIGABRT, SIGALRM, SIGBUS, SIGFPE, SIGILL, SIGPIPE, SIGSEGV,
 329             SIGXCPU, SIGXFSZ
 330         };
 331
 332         size_t i;
 333
 334         for (i = 0; i < ARRAY_SIZE(error_signals); i++) {
 335             if (error_signals[i] == WTERMSIG(status)) {
 336                 return true;
 337             }
 338         }
 339     }
 340     return false;
 341 }
 342
 343 static void
 344 monitor_daemon(pid_t daemon_pid)
 345 {
 346     /* XXX Should log daemon's stderr output at startup time. */
 347     time_t last_restart;
 348     char *status_msg;
 349     int crashes;
 350     bool child_ready = true;
 351
 352     set_subprogram_name("monitor");
 353     status_msg = xstrdup("healthy");
 354     last_restart = TIME_MIN;
 355     crashes = 0;
 356     for (;;) {
 357         int retval;
 358         int status;
 359
 360         ovs_cmdl_proctitle_set("monitoring pid %lu (%s)",
 361                                (unsigned long int) daemon_pid, status_msg);
 362
 363         if (child_ready) {
 364             do {
 365                 retval = waitpid(daemon_pid, &status, 0);
 366             } while (retval == -1 && errno == EINTR);
 367             if (retval == -1) {
 368                 VLOG_FATAL("waitpid failed (%s)", ovs_strerror(errno));
 369             }
 370         }
 371
 372         if (!child_ready || retval == daemon_pid) {
 373             char *s = process_status_msg(status);
 374             if (should_restart(status)) {
 375                 free(status_msg);
 376                 status_msg = xasprintf("%d crashes: pid %lu died, %s",
 377                                        ++crashes,
 378                                        (unsigned long int) daemon_pid, s);
 379                 free(s);
 380
 381                 if (WCOREDUMP(status)) {
 382                     /* Disable further core dumps to save disk space. */
 383                     struct rlimit r;
 384
 385                     r.rlim_cur = 0;
 386                     r.rlim_max = 0;
 387                     if (setrlimit(RLIMIT_CORE, &r) == -1) {
 388                         VLOG_WARN("failed to disable core dumps: %s",
 389                                   ovs_strerror(errno));
 390                     }
 391                 }
 392
 393                 /* Throttle restarts to no more than once every 10 seconds. */
 394                 if (time(NULL) < last_restart + 10) {
 395                     VLOG_WARN("%s, waiting until 10 seconds since last "
 396                               "restart", status_msg);
 397                     for (;;) {
 398                         time_t now = time(NULL);
 399                         time_t wakeup = last_restart + 10;
 400                         if (now >= wakeup) {
 401                             break;
 402                         }
 403                         xsleep(wakeup - now);
 404                     }
 405                 }
 406                 last_restart = time(NULL);
 407
 408                 VLOG_ERR("%s, restarting", status_msg);
 409                 child_ready = !fork_and_wait_for_startup(&daemonize_fd,
 410                                                          &daemon_pid);
 411                 if (child_ready && !daemon_pid) {
 412                     /* Child process needs to break out of monitoring
 413                      * loop. */
 414                     break;
 415                 }
 416             } else {
 417                 VLOG_INFO("pid %lu died, %s, exiting",
 418                           (unsigned long int) daemon_pid, s);
 419                 free(s);
 420                 exit(0);
 421             }
 422         }
 423     }
 424     free(status_msg);
 425
 426     /* Running in new daemon process. */
 427     ovs_cmdl_proctitle_restore();
 428     set_subprogram_name("");
 429 }
 430
 431 /* If daemonization is configured, then starts daemonization, by forking and
 432  * returning in the child process.  The parent process hangs around until the
 433  * child lets it know either that it completed startup successfully (by calling
 434  * daemon_complete()) or that it failed to start up (by exiting with a nonzero
 435  * exit code). */
 436 void
 437 daemonize_start(bool access_datapath)
 438 {
 439     assert_single_threaded();
 440     daemonize_fd = -1;
 441
 442     if (switch_user) {
 443         daemon_become_new_user__(access_datapath);
 444         switch_user = false;
 445     }
 446
 447     if (detach) {
 448         pid_t pid;
 449
 450         if (fork_and_wait_for_startup(&daemonize_fd, &pid)) {
 451             VLOG_FATAL("could not detach from foreground session");
 452         }
 453         if (pid > 0) {
 454             /* Running in parent process. */
 455             exit(0);
 456         }
 457
 458         /* Running in daemon or monitor process. */
 459         setsid();
 460     }
 461
 462     if (monitor) {
 463         int saved_daemonize_fd = daemonize_fd;
 464         pid_t daemon_pid;
 465
 466         if (fork_and_wait_for_startup(&daemonize_fd, &daemon_pid)) {
 467             VLOG_FATAL("could not initiate process monitoring");
 468         }
 469         if (daemon_pid > 0) {
 470             /* Running in monitor process. */
 471             fork_notify_startup(saved_daemonize_fd);
 472             close_standard_fds();
 473             monitor_daemon(daemon_pid);
 474         }
 475         /* Running in daemon process. */
 476     }
 477
 478     forbid_forking("running in daemon process");
 479
 480     if (pidfile) {
 481         make_pidfile();
 482     }
 483
 484     /* Make sure that the unixctl commands for vlog get registered in a
 485      * daemon, even before the first log message. */
 486     vlog_init();
 487 }
 488
 489 /* If daemonization is configured, then this function notifies the parent
 490  * process that the child process has completed startup successfully.  It also
 491  * call daemonize_post_detach().
 492  *
 493  * Calling this function more than once has no additional effect. */
 494 void
 495 daemonize_complete(void)
 496 {
 497     if (pidfile) {
 498         free(pidfile);
 499         pidfile = NULL;
 500     }
 501
 502     if (!detached) {
 503         detached = true;
 504
 505         fork_notify_startup(daemonize_fd);
 506         daemonize_fd = -1;
 507         daemonize_post_detach();
 508     }
 509 }
 510
 511 /* If daemonization is configured, then this function does traditional Unix
 512  * daemonization behavior: join a new session, chdir to the root (if not
 513  * disabled), and close the standard file descriptors.
 514  *
 515  * It only makes sense to call this function as part of an implementation of a
 516  * special daemon subprocess.  A normal daemon should just call
 517  * daemonize_complete(). */
 518 static void
 519 daemonize_post_detach(void)
 520 {
 521     if (detach) {
 522         if (chdir_) {
 523             ignore(chdir("/"));
 524         }
 525         close_standard_fds();
 526     }
 527 }
 528
 529 void
 530 daemon_usage(void)
 531 {
 532     printf(
 533         "\nDaemon options:\n"
 534         "  --detach                run in background as daemon\n"
 535         "  --no-chdir              do not chdir to '/'\n"
 536         "  --pidfile[=FILE]        create pidfile (default: %s/%s.pid)\n"
 537         "  --overwrite-pidfile     with --pidfile, start even if already "
 538                                    "running\n",
 539         ovs_rundir(), program_name);
 540 }
 541
 542 static int
 543 lock_pidfile__(FILE *file, int command, struct flock *lck)
 544 {
 545     int error;
 546
 547     lck->l_type = F_WRLCK;
 548     lck->l_whence = SEEK_SET;
 549     lck->l_start = 0;
 550     lck->l_len = 0;
 551     lck->l_pid = 0;
 552
 553     do {
 554         error = fcntl(fileno(file), command, lck) == -1 ? errno : 0;
 555     } while (error == EINTR);
 556     return error;
 557 }
 558
 559 static int
 560 lock_pidfile(FILE *file, int command)
 561 {
 562     struct flock lck;
 563
 564     return lock_pidfile__(file, command, &lck);
 565 }
 566
 567 static pid_t
 568 read_pidfile__(const char *pidfile, bool delete_if_stale)
 569 {
 570     struct stat s, s2;
 571     struct flock lck;
 572     char line[128];
 573     FILE *file;
 574     int error;
 575
 576     if ((pidfile_ino || pidfile_dev)
 577         && !stat(pidfile, &s)
 578         && s.st_ino == pidfile_ino && s.st_dev == pidfile_dev) {
 579         /* It's our own pidfile.  We can't afford to open it, because closing
 580          * *any* fd for a file that a process has locked also releases all the
 581          * locks on that file.
 582          *
 583          * Fortunately, we know the associated pid anyhow: */
 584         return getpid();
 585     }
 586
 587     file = fopen(pidfile, "r+");
 588     if (!file) {
 589         if (errno == ENOENT && delete_if_stale) {
 590             return 0;
 591         }
 592         error = errno;
 593         VLOG_WARN("%s: open: %s", pidfile, ovs_strerror(error));
 594         goto error;
 595     }
 596
 597     error = lock_pidfile__(file, F_GETLK, &lck);
 598     if (error) {
 599         VLOG_WARN("%s: fcntl: %s", pidfile, ovs_strerror(error));
 600         goto error;
 601     }
 602     if (lck.l_type == F_UNLCK) {
 603         /* pidfile exists but it isn't locked by anyone.  We need to delete it
 604          * so that a new pidfile can go in its place.  But just calling
 605          * unlink(pidfile) makes a nasty race: what if someone else unlinks it
 606          * before we do and then replaces it by a valid pidfile?  We'd unlink
 607          * their valid pidfile.  We do a little dance to avoid the race, by
 608          * locking the invalid pidfile.  Only one process can have the invalid
 609          * pidfile locked, and only that process has the right to unlink it. */
 610         if (!delete_if_stale) {
 611             error = ESRCH;
 612             VLOG_DBG("%s: pid file is stale", pidfile);
 613             goto error;
 614         }
 615
 616         /* Get the lock. */
 617         error = lock_pidfile(file, F_SETLK);
 618         if (error) {
 619             /* We lost a race with someone else doing the same thing. */
 620             VLOG_WARN("%s: lost race to lock pidfile", pidfile);
 621             goto error;
 622         }
 623
 624         /* Is the file we have locked still named 'pidfile'? */
 625         if (stat(pidfile, &s) || fstat(fileno(file), &s2)
 626             || s.st_ino != s2.st_ino || s.st_dev != s2.st_dev) {
 627             /* No.  We lost a race with someone else who got the lock before
 628              * us, deleted the pidfile, and closed it (releasing the lock). */
 629             error = EALREADY;
 630             VLOG_WARN("%s: lost race to delete pidfile", pidfile);
 631             goto error;
 632         }
 633
 634         /* We won the right to delete the stale pidfile. */
 635         if (unlink(pidfile)) {
 636             error = errno;
 637             VLOG_WARN("%s: failed to delete stale pidfile (%s)",
 638                       pidfile, ovs_strerror(error));
 639             goto error;
 640         }
 641         VLOG_DBG("%s: deleted stale pidfile", pidfile);
 642         fclose(file);
 643         return 0;
 644     }
 645
 646     if (!fgets(line, sizeof line, file)) {
 647         if (ferror(file)) {
 648             error = errno;
 649             VLOG_WARN("%s: read: %s", pidfile, ovs_strerror(error));
 650         } else {
 651             error = ESRCH;
 652             VLOG_WARN("%s: read: unexpected end of file", pidfile);
 653         }
 654         goto error;
 655     }
 656
 657     if (lck.l_pid != strtoul(line, NULL, 10)) {
 658         /* The process that has the pidfile locked is not the process that
 659          * created it.  It must be stale, with the process that has it locked
 660          * preparing to delete it. */
 661         error = ESRCH;
 662         VLOG_WARN("%s: stale pidfile for pid %s being deleted by pid %ld",
 663                   pidfile, line, (long int) lck.l_pid);
 664         goto error;
 665     }
 666
 667     fclose(file);
 668     return lck.l_pid;
 669
 670 error:
 671     if (file) {
 672         fclose(file);
 673     }
 674     return -error;
 675 }
 676
 677 /* Opens and reads a PID from 'pidfile'.  Returns the positive PID if
 678  * successful, otherwise a negative errno value. */
 679 pid_t
 680 read_pidfile(const char *pidfile)
 681 {
 682     return read_pidfile__(pidfile, false);
 683 }
 684
 685 /* Checks whether a process with the given 'pidfile' is already running and,
 686  * if so, aborts.  If 'pidfile' is stale, deletes it. */
 687 static void
 688 check_already_running(void)
 689 {
 690     long int pid = read_pidfile__(pidfile, true);
 691     if (pid > 0) {
 692         VLOG_FATAL("%s: already running as pid %ld, aborting", pidfile, pid);
 693     } else if (pid < 0) {
 694         VLOG_FATAL("%s: pidfile check failed (%s), aborting",
 695                    pidfile, ovs_strerror(-pid));
 696     }
 697 }
 698
 699 \f
 700 /* stub functions for non-windows platform. */
 701
 702 void
 703 service_start(int *argc OVS_UNUSED, char **argv[] OVS_UNUSED)
 704 {
 705 }
 706
 707 void
 708 service_stop(void)
 709 {
 710 }
 711
 712 bool
 713 should_service_stop(void)
 714 {
 715     return false;
 716 }
 717
 718 \f
 719 static bool
 720 gid_matches(gid_t expected, gid_t value)
 721 {
 722     return expected == -1 || expected == value;
 723 }
 724
 725 static bool
 726 gid_verify(gid_t gid)
 727 {
 728     gid_t r, e;
 729
 730     r = getgid();
 731     e = getegid();
 732     return (gid_matches(gid, r) &&
 733             gid_matches(gid, e));
 734 }
 735
 736 static void
 737 daemon_switch_group(gid_t gid)
 738 {
 739     if ((setgid(gid) == -1) || !gid_verify(gid)) {
 740         VLOG_FATAL("%s: fail to switch group to gid as %d, aborting",
 741                    pidfile, gid);
 742     }
 743 }
 744
 745 static bool
 746 uid_matches(uid_t expected, uid_t value)
 747 {
 748     return expected == -1 || expected == value;
 749 }
 750
 751 static bool
 752 uid_verify(const uid_t uid)
 753 {
 754     uid_t r, e;
 755
 756     r = getuid();
 757     e = geteuid();
 758     return (uid_matches(uid, r) &&
 759             uid_matches(uid, e));
 760 }
 761
 762 static void
 763 daemon_switch_user(const uid_t uid, const char *user)
 764 {
 765     if ((setuid(uid) == -1) || !uid_verify(uid)) {
 766         VLOG_FATAL("%s: fail to switch user to %s, aborting",
 767                    pidfile, user);
 768     }
 769 }
 770
 771 /* Use portable Unix APIs to switch uid:gid, when datapath
 772  * access is not required.  On Linux systems, all capabilities
 773  * will be dropped.  */
 774 static void
 775 daemon_become_new_user_unix(void)
 776 {
 777     /* "Setuid Demystified" by Hao Chen, etc outlines some caveats of
 778      * around unix system call setuid() and friends. This implementation
 779      * mostly follow the advice given by the paper.  The paper is
 780      * published in 2002, so things could have changed.  */
 781
 782     /* Change both real and effective uid and gid will permanently
 783      * drop the process' privilege.  "Setuid Demystified" suggested
 784      * that calling getuid() after each setuid() call to verify they
 785      * are actually set, because checking return code alone is not
 786      * sufficient.  */
 787     daemon_switch_group(gid);
 788     if (user && initgroups(user, gid) == -1) {
 789         VLOG_FATAL("%s: fail to add supplementary group gid %d, "
 790                    "aborting", pidfile, gid);
 791     }
 792     daemon_switch_user(uid, user);
 793 }
 794
 795 /* Linux specific implementation of daemon_become_new_user()
 796  * using libcap-ng.   */
 797 static void
 798 daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
 799 {
 800 #if defined __linux__ &&  HAVE_LIBCAPNG
 801     int ret;
 802
 803     ret = capng_get_caps_process();
 804
 805     if (!ret) {
 806         if (capng_have_capabilities(CAPNG_SELECT_CAPS) > CAPNG_NONE) {
 807             const capng_type_t cap_sets = CAPNG_EFFECTIVE|CAPNG_PERMITTED;
 808
 809             capng_clear(CAPNG_SELECT_BOTH);
 810
 811             ret = capng_update(CAPNG_ADD, cap_sets, CAP_IPC_LOCK)
 812                   || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BIND_SERVICE);
 813
 814             if (access_datapath && !ret) {
 815                 ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
 816                       || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW);
 817             }
 818         } else {
 819             ret = -1;
 820         }
 821     }
 822
 823     if (!ret) {
 824         /* CAPNG_INIT_SUPP_GRP will be a better choice than
 825          * CAPNG_DROP_SUPP_GRP. However this enum value is only defined
 826          * with libcap-ng higher than version 0.7.4, which is not wildly
 827          * available on many Linux distributions yet. Taking a more
 828          * conservative approach to make sure OVS behaves consistently.
 829          *
 830          * XXX We may change this for future OVS releases.
 831          */
 832         ret = capng_change_id(uid, gid, CAPNG_DROP_SUPP_GRP
 833                               | CAPNG_CLEAR_BOUNDING);
 834     }
 835
 836     if (ret) {
 837         VLOG_FATAL("%s: libcap-ng fail to switch to user and group "
 838                    "%d:%d, aborting", pidfile, uid, gid);
 839     }
 840 #endif
 841 }
 842
 843 static void
 844 daemon_become_new_user__(bool access_datapath)
 845 {
 846     /* If vlog file has been created, change its owner to the non-root user
 847      * as specifed by the --user option.  */
 848     vlog_change_owner_unix(uid, gid);
 849
 850     if (LINUX) {
 851         if (LIBCAPNG) {
 852             daemon_become_new_user_linux(access_datapath);
 853         } else {
 854             VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
 855                        "(libcap-ng is not configured at compile time), "
 856                        "aborting.", pidfile);
 857         }
 858     } else {
 859         daemon_become_new_user_unix();
 860     }
 861 }
 862
 863 /* Noramlly, user switch is embedded within daemonize_start().
 864  * However, there in case the user switch needs to be done
 865  * before daemonize_start(), the following API can be used.  */
 866 void
 867 daemon_become_new_user(bool access_datapath)
 868 {
 869     assert_single_threaded();
 870     if (switch_user) {
 871         daemon_become_new_user__(access_datapath);
 872         /* daemonize_start() should not switch user again. */
 873         switch_user = false;
 874     }
 875 }
 876
 877 /* Return the maximun suggested buffer size for both getpwname_r()
 878  * and getgrnam_r().
 879  *
 880  * This size may still not be big enough. in case getpwname_r()
 881  * and friends return ERANGE, a larger buffer should be supplied to
 882  * retry. (The man page did not specify the max size to stop at, we
 883  * will keep trying with doubling the buffer size for each round until
 884  * the size wrapps around size_t.  */
 885 static size_t
 886 get_sysconf_buffer_size(void)
 887 {
 888     size_t bufsize, pwd_bs = 0, grp_bs = 0;
 889     const size_t default_bufsize = 1024;
 890
 891     errno = 0;
 892     if ((pwd_bs = sysconf(_SC_GETPW_R_SIZE_MAX)) == -1) {
 893         if (errno) {
 894             VLOG_FATAL("%s: Read initial passwordd struct size "
 895                        "failed (%s), aborting. ", pidfile,
 896                        ovs_strerror(errno));
 897         }
 898     }
 899
 900     if ((grp_bs = sysconf(_SC_GETGR_R_SIZE_MAX)) == -1) {
 901         if (errno) {
 902             VLOG_FATAL("%s: Read initial group struct size "
 903                        "failed (%s), aborting. ", pidfile,
 904                        ovs_strerror(errno));
 905         }
 906     }
 907
 908     bufsize = MAX(pwd_bs, grp_bs);
 909     return bufsize ? bufsize : default_bufsize;
 910 }
 911
 912 /* Try to double the size of '*buf', return true
 913  * if successful, and '*sizep' will be updated with
 914  * the new size. Otherwise, return false.  */
 915 static bool
 916 enlarge_buffer(char **buf, size_t *sizep)
 917 {
 918     size_t newsize = *sizep * 2;
 919
 920     if (newsize > *sizep) {
 921         *buf = xrealloc(*buf, newsize);
 922         *sizep = newsize;
 923         return true;
 924     }
 925
 926     return false;
 927 }
 928
 929 /* Parse and sanity check user_spec.
 930  *
 931  * If successful, set global variables 'uid' and 'gid'
 932  * with the parsed results. Global variable 'user'
 933  * will be pointing to a string that stores the name
 934  * of the user to be switched into.
 935  *
 936  * Also set 'switch_to_new_user' to true, The actual
 937  * user switching is done as soon as daemonize_start()
 938  * is called. I/O access before calling daemonize_start()
 939  * will still be with root's credential.  */
 940 void
 941 daemon_set_new_user(const char *user_spec)
 942 {
 943     char *pos = strchr(user_spec, ':');
 944     size_t init_bufsize, bufsize;
 945
 946     init_bufsize = get_sysconf_buffer_size();
 947     uid = getuid();
 948     gid = getgid();
 949
 950     if (geteuid() || uid) {
 951         VLOG_FATAL("%s: only root can use --user option", pidfile);
 952     }
 953
 954     user_spec += strspn(user_spec, " \t\r\n");
 955     size_t len = pos ? pos - user_spec : strlen(user_spec);
 956     char *buf;
 957     struct passwd pwd, *res;
 958     int e;
 959
 960     bufsize = init_bufsize;
 961     buf = xmalloc(bufsize);
 962     if (len) {
 963         user = xmemdup0(user_spec, len);
 964
 965         while ((e = getpwnam_r(user, &pwd, buf, bufsize, &res)) == ERANGE) {
 966             if (!enlarge_buffer(&buf, &bufsize)) {
 967                 break;
 968             }
 969         }
 970
 971         if (e != 0) {
 972             VLOG_FATAL("%s: Failed to retrive user %s's uid (%s), aborting.",
 973                        pidfile, user, ovs_strerror(e));
 974         }
 975     } else {
 976         /* User name is not specified, use current user.  */
 977         while ((e = getpwuid_r(uid, &pwd, buf, bufsize, &res)) == ERANGE) {
 978             if (!enlarge_buffer(&buf, &bufsize)) {
 979                 break;
 980             }
 981         }
 982
 983         if (e != 0) {
 984             VLOG_FATAL("%s: Failed to retrive current user's name "
 985                        "(%s), aborting.", pidfile, ovs_strerror(e));
 986         }
 987         user = xstrdup(pwd.pw_name);
 988     }
 989
 990     uid = pwd.pw_uid;
 991     gid = pwd.pw_gid;
 992     free(buf);
 993
 994     if (pos) {
 995         char *grpstr = pos + 1;
 996         grpstr += strspn(grpstr, " \t\r\n");
 997
 998         if (*grpstr) {
 999             struct group grp, *res;
1000
1001             bufsize = init_bufsize;
1002             buf = xmalloc(bufsize);
1003             while ((e = getgrnam_r(grpstr, &grp, buf, bufsize, &res))
1004                          == ERANGE) {
1005                 if (!enlarge_buffer(&buf, &bufsize)) {
1006                     break;
1007                 }
1008             }
1009
1010             if (e) {
1011                 VLOG_FATAL("%s: Failed to get group entry for %s, "
1012                            "(%s), aborting.", pidfile, grpstr,
1013                            ovs_strerror(e));
1014             }
1015
1016             if (gid != grp.gr_gid) {
1017                 char **mem;
1018
1019                 for (mem = grp.gr_mem; *mem; ++mem) {
1020                     if (!strcmp(*mem, user)) {
1021                         break;
1022                     }
1023                 }
1024
1025                 if (!*mem) {
1026                     VLOG_FATAL("%s: Invalid --user option %s (user %s is "
1027                                "not in group %s), aborting.", pidfile,
1028                                user_spec, user, grpstr);
1029                 }
1030                 gid = grp.gr_gid;
1031             }
1032             free(buf);
1033         }
1034     }
1035
1036     switch_user = true;
1037 }