/******************************************************************************
 * Copyright (c) Huawei Technologies Co., Ltd. 2017-2019. All rights reserved.
 * iSulad licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *     http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
 * PURPOSE.
 * See the Mulan PSL v2 for more details.
 * Author: tanyifeng
 * Create: 2017-11-22
 * Description: provide container restore function definition
 ********************************************************************************/

#include <stdio.h>
#include <dlfcn.h>
#include <unistd.h>
#include <limits.h>
#include <pthread.h>

#include "isulad_config.h"
#include "isula_libutils/log.h"
#include "restore.h"
#include "containers_store.h"
#include "supervisor.h"
#include "containers_gc.h"
#include "container_unix.h"
#include "error.h"
#include "image.h"
#include "runtime.h"
#include "service_container.h"

/* restore supervisor */
L
LiFeng 已提交
35
static int restore_supervisor(const container_t *cont)
O
overweight 已提交
36 37 38 39 40 41
{
    int ret = 0;
    int nret = 0;
    int exit_fifo_fd = -1;
    char container_state[PATH_MAX] = { 0 };
    char *exit_fifo = NULL;
L
LiFeng 已提交
42 43 44 45
    char *id = cont->common_config->id;
    char *statepath = cont->state_path;
    char *runtime = cont->runtime;
    container_pid_t pid_info = { 0 };
O
overweight 已提交
46

O
openeuler-iSula 已提交
47 48
    nret = snprintf(container_state, sizeof(container_state), "%s/%s", statepath, id);
    if (nret < 0 || (size_t)nret >= sizeof(container_state)) {
O
overweight 已提交
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
        ERROR("Failed to sprintf container state %s/%s", statepath, id);
        ret = -1;
        goto out;
    }

    exit_fifo = exit_fifo_name(container_state);
    if (exit_fifo == NULL) {
        ERROR("Failed to get exit fifo name %s/%s", statepath, id);
        ret = -1;
        goto out;
    }

    exit_fifo_fd = exit_fifo_open(exit_fifo);
    if (exit_fifo_fd < 0) {
        ERROR("Failed to open exit FIFO %s", exit_fifo);
        ret = -1;
        goto out;
    }

L
LiFeng 已提交
68 69 70 71
    pid_info.pid = cont->state->state->pid;
    pid_info.ppid = cont->state->state->p_pid;
    pid_info.start_time = cont->state->state->start_time;
    pid_info.pstart_time = cont->state->state->p_start_time;
O
overweight 已提交
72

L
LiFeng 已提交
73
    if (supervisor_add_exit_monitor(exit_fifo_fd, &pid_info, id, runtime)) {
O
overweight 已提交
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
        ERROR("Failed to add exit monitor to supervisor");
        ret = -1;
        goto out;
    }

out:
    free(exit_fifo);

    return ret;
}

/* post stopped container to gc */
static int post_stopped_container_to_gc(const char *id, const char *runtime, const char *statepath, uint32_t pid)
{
    int ret = 0;
L
LiFeng 已提交
89
    container_pid_t pid_info = { 0 };
O
overweight 已提交
90

L
LiFeng 已提交
91
    (void)container_read_proc(pid, &pid_info);
O
overweight 已提交
92

L
LiFeng 已提交
93
    if (gc_add_container(id, runtime, &pid_info)) {
O
overweight 已提交
94 95 96 97 98 99 100 101 102 103
        ERROR("Failed to post container %s to garbage collector", id);
        ret = -1;
        goto out;
    }

out:
    return ret;
}

/*
 * Clean up a container whose image no longer exists.
 *
 * @cont:   container being restored
 * @status: status recorded in the container's saved state
 * @info:   status reported by the runtime
 *
 * - runtime says STOPPED: if the recorded status is neither STOPPED nor
 *   CREATED, post the container to the garbage collector (pid 0) and mark
 *   it stopped; otherwise nothing is done.
 * - runtime says RUNNING: post the container to the garbage collector with
 *   the runtime-reported pid and mark it stopped.
 * - anything else: log the unexpected status and do nothing.
 *
 * A failure to post to the garbage collector is only logged; the state is
 * still set to stopped with the value 255.
 */
static void post_nonexist_image_containers(const container_t *cont, Container_Status status,
                                           const struct runtime_container_status_info *info)
{
    const char *id = cont->common_config->id;
    uint32_t pid = 0;

    if (info->status == RUNTIME_CONTAINER_STATUS_STOPPED) {
        if (status == CONTAINER_STATUS_STOPPED || status == CONTAINER_STATUS_CREATED) {
            /* recorded state already agrees with the runtime */
            return;
        }
    } else if (info->status == RUNTIME_CONTAINER_STATUS_RUNNING) {
        pid = info->pid;
    } else {
        ERROR("Container %s get invalid status %d", id, info->status);
        return;
    }

    if (post_stopped_container_to_gc(id, cont->runtime, cont->state_path, pid) != 0) {
        /* adjacent literals are concatenated verbatim, so each piece needs its own trailing space */
        ERROR("Failed to post container %s to garbage "
              "collector, that may lost some resources "
              "used with container!",
              id);
    }
    state_set_stopped(cont->state, 255);
}

/*
 * Check that the image a container was created from is still available.
 *
 * Only images of type IMAGE_TYPE_OCI are checked; any other image type is
 * accepted without a lookup.
 *
 * Returns 0 when the image is usable, -1 when the image name/type is
 * missing or the image does not exist, and the non-zero result of
 * im_resolv_image_name() when the name cannot be resolved.
 */
static int check_container_image_exist(const container_t *cont)
{
    int result = 0;
    char *resolved = NULL;
    const char *id = cont->common_config->id;
    const char *image_name = cont->common_config->image;
    const char *image_type = cont->common_config->image_type;

    if (image_name == NULL || image_type == NULL) {
        ERROR("Failed to get image type for container %s", id);
        return -1;
    }

    /* only check exist for oci image */
    if (strcmp(image_type, IMAGE_TYPE_OCI) != 0) {
        return 0;
    }

    result = im_resolv_image_name(image_type, image_name, &resolved);
    if (result != 0) {
        ERROR("Failed to resolve image %s", image_name);
    } else if (!im_oci_image_exist(resolved)) {
        WARN("Image %s not exist", resolved);
        result = -1;
    }

    free(resolved);
    return result;
}

L
LiFeng 已提交
170
static bool is_same_process(const container_t *cont, const container_pid_t *pid_info)
O
overweight 已提交
171
{
172
    if (pid_info->pid == cont->state->state->pid && pid_info->ppid == cont->state->state->p_pid &&
L
LiFeng 已提交
173 174 175
        pid_info->start_time == cont->state->state->start_time &&
        pid_info->pstart_time == cont->state->state->p_start_time) {
        return true;
O
overweight 已提交
176
    }
L
LiFeng 已提交
177
    return false;
O
overweight 已提交
178 179
}

D
dogsheng 已提交
180 181 182
static void try_to_set_paused_container_pid(Container_Status status, const container_t *cont,
                                            const container_pid_t *pid_info)
{
L
LiFeng 已提交
183
    if (status != CONTAINER_STATUS_RUNNING || !is_same_process(cont, pid_info)) {
D
dogsheng 已提交
184 185 186 187
        state_set_running(cont->state, pid_info, false);
    }
}

188
static void try_to_set_container_running(Container_Status status, container_t *cont, const container_pid_t *pid_info)
O
overweight 已提交
189
{
L
LiFeng 已提交
190 191
    if (status != CONTAINER_STATUS_RUNNING || !is_same_process(cont, pid_info)) {
        state_set_running(cont->state, pid_info, true);
O
overweight 已提交
192 193 194 195 196 197
    }
}

/*
 * Reconcile a container that the runtime reports as stopped.
 *
 * @status:    status recorded in the container's saved state
 * @cont:      container being restored
 * @need_save: set to true when the saved state was changed and must be
 *             written back to disk
 *
 * If the recorded status is already STOPPED or CREATED nothing is done.
 * Otherwise the container is posted to the garbage collector (with the
 * recorded pid if that process is still alive, else pid 0) and its state is
 * set to stopped with the value 255. A failure to post to the garbage
 * collector is only logged.
 *
 * Always returns 0.
 */
static int restore_stopped_container(Container_Status status, const container_t *cont, bool *need_save)
{
    const char *id = cont->common_config->id;
    pid_t pid = 0;

    if (status == CONTAINER_STATUS_STOPPED || status == CONTAINER_STATUS_CREATED) {
        return 0;
    }

    if (util_process_alive(cont->state->state->pid, cont->state->state->start_time)) {
        pid = cont->state->state->pid;
    }

    if (post_stopped_container_to_gc(id, cont->runtime, cont->state_path, pid) != 0) {
        /* adjacent literals are concatenated verbatim, so each piece needs its own trailing space */
        ERROR("Failed to post container %s to garbage "
              "collector, that may lost some resources "
              "used with container!",
              id);
    }
    state_set_stopped(cont->state, 255);
    *need_save = true;

    return 0;
}

/*
 * Reconcile a container that the runtime reports as running.
 *
 * @status: status recorded in the container's saved state
 * @cont:   container being restored
 * @info:   status reported by the runtime; info->pid is the process to read
 *
 * Reads the process information for info->pid. On success the container is
 * marked running (if it is not already recorded as running with the same
 * process) and container_reset_manually_stopped() is called. On failure the
 * container is posted to the garbage collector with pid 0; a failure of
 * that post is only logged.
 *
 * Returns 0 on success, -1 if the process information could not be read.
 */
static int restore_running_container(Container_Status status, container_t *cont,
                                     const struct runtime_container_status_info *info)
{
    const char *id = cont->common_config->id;
    container_pid_t pid_info = { 0 };

    if (container_read_proc(info->pid, &pid_info) != 0) {
        ERROR("Failed to restore container:%s due to unable to read container pid information", id);
        if (post_stopped_container_to_gc(id, cont->runtime, cont->state_path, 0) != 0) {
            /* adjacent literals are concatenated verbatim, so each piece needs its own trailing space */
            ERROR("Failed to post container %s to garbage "
                  "collector, that may lost some resources "
                  "used with container!",
                  id);
        }
        return -1;
    }

    try_to_set_container_running(status, cont, &pid_info);
    container_reset_manually_stopped(cont);

    return 0;
}

D
dogsheng 已提交
248
static int restore_paused_container(Container_Status status, container_t *cont,
249
                                    const struct runtime_container_status_info *info)
D
dogsheng 已提交
250 251
{
    int ret = 0;
L
LiFeng 已提交
252
    int nret = 0;
D
dogsheng 已提交
253
    const char *id = cont->common_config->id;
L
LiFeng 已提交
254
    container_pid_t pid_info = { 0 };
D
dogsheng 已提交
255 256 257

    state_set_paused(cont->state);

L
LiFeng 已提交
258 259 260 261 262 263
    nret = container_read_proc(info->pid, &pid_info);
    if (nret == 0) {
        try_to_set_paused_container_pid(status, cont, &pid_info);
    } else {
        ERROR("Failed to restore container:%s due to unable to read container pid information", id);
        nret = post_stopped_container_to_gc(id, cont->runtime, cont->state_path, 0);
D
dogsheng 已提交
264 265 266
        if (nret != 0) {
            ERROR("Failed to post container %s to garbage"
                  "collector, that may lost some resources"
267 268
                  "used with container!",
                  id);
D
dogsheng 已提交
269 270 271 272
        }
        ret = -1;
        goto out;
    }
L
LiFeng 已提交
273

D
dogsheng 已提交
274 275 276 277 278 279
    container_reset_manually_stopped(cont);

out:
    return ret;
}

O
overweight 已提交
280
/* restore state */
L
LiFeng 已提交
281
static int restore_state(container_t *cont)
O
overweight 已提交
282 283
{
    int ret = 0;
L
LiFeng 已提交
284
    int nret = 0;
O
overweight 已提交
285 286
    bool need_save = false;
    const char *id = cont->common_config->id;
L
LiFeng 已提交
287 288
    const char *runtime = cont->runtime;
    rt_status_params_t params = { 0 };
289
    struct runtime_container_status_info real_status = { 0 };
290
    Container_Status status = state_get_status(cont->state);
O
overweight 已提交
291 292 293 294 295

    (void)container_exit_on_next(cont); /* cancel restart policy */

    if (check_container_image_exist(cont) != 0) {
        ERROR("Failed to restore container:%s due to image not exist", id);
L
LiFeng 已提交
296
        post_nonexist_image_containers(cont, status, &real_status);
O
overweight 已提交
297 298 299 300
        ret = -1;
        goto out;
    }

301
    params.rootpath = cont->root_path;
302
    params.state = cont->state_path;
303 304
    nret = runtime_status(id, runtime, &params, &real_status);
    if (nret != 0) {
305 306
        ERROR("Failed to restore container %s, make real status to STOPPED. Due to can not load container with status %d",
              id, status);
307
        real_status.status = RUNTIME_CONTAINER_STATUS_STOPPED;
308 309
    }

310
    if (real_status.status == RUNTIME_CONTAINER_STATUS_STOPPED) {
O
overweight 已提交
311 312 313 314
        ret = restore_stopped_container(status, cont, &need_save);
        if (ret != 0) {
            goto out;
        }
315
    } else if (real_status.status == RUNTIME_CONTAINER_STATUS_RUNNING) {
L
LiFeng 已提交
316
        ret = restore_running_container(status, cont, &real_status);
O
overweight 已提交
317 318 319
        if (ret != 0) {
            goto out;
        }
320
    } else if (real_status.status == RUNTIME_CONTAINER_STATUS_PAUSED) {
L
LiFeng 已提交
321
        ret = restore_paused_container(status, cont, &real_status);
D
dogsheng 已提交
322 323 324
        if (ret != 0) {
            goto out;
        }
O
overweight 已提交
325
    } else {
L
LiFeng 已提交
326 327 328
        ERROR("Container %s get invalid status %d", id, real_status.status);
        ret = -1;
        goto out;
O
overweight 已提交
329 330
    }

331
out:
O
overweight 已提交
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350
    if (is_removal_in_progress(cont->state)) {
        state_reset_removal_in_progress(cont->state);
        need_save = true;
    }
    if (need_save && container_to_disk(cont) != 0) {
        ERROR("Failed to re-save container \"%s\" to disk", id);
        ret = -1;
    }
    return ret;
}

/* remove invalid container */
static int remove_invalid_container(const container_t *cont, const char *runtime, const char *root, const char *state,
                                    const char *id)
{
    int ret = 0;
    char container_root[PATH_MAX] = { 0x00 };
    char container_state[PATH_MAX] = { 0x00 };

O
openeuler-iSula 已提交
351 352
    ret = snprintf(container_state, sizeof(container_state), "%s/%s", state, id);
    if (ret < 0 || (size_t)ret >= sizeof(container_state)) {
O
overweight 已提交
353 354 355 356 357 358 359 360 361 362 363
        ERROR("Failed to sprintf container state %s/%s", state, id);
        ret = -1;
        goto out;
    }
    ret = util_recursive_rmdir(container_state, 0);
    if (ret != 0) {
        ERROR("Failed to delete container's state directory %s", container_state);
        ret = -1;
        goto out;
    }

H
haozi007 已提交
364 365 366 367 368 369 370
    ret = cleanup_mounts_by_id(id, root);
    if (ret != 0) {
        ERROR("Failed to clean container's mounts");
        ret = -1;
        goto out;
    }

O
openeuler-iSula 已提交
371 372
    ret = snprintf(container_root, sizeof(container_root), "%s/%s", root, id);
    if (ret < 0 || (size_t)ret >= sizeof(container_root)) {
O
overweight 已提交
373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
        ERROR("Failed to sprintf invalid root directory %s/%s", root, id);
        ret = -1;
        goto out;
    }

    if (cont != NULL && im_remove_container_rootfs(cont->common_config->image_type, id)) {
        ERROR("Failed to remove rootfs for container %s", id);
        ret = -1;
        goto out;
    }

    ret = util_recursive_rmdir(container_root, 0);
    if (ret != 0) {
        ERROR("Failed to delete container's state directory %s", container_state);
        ret = -1;
        goto out;
    }
out:
    return ret;
}

/*
 * Restart a restored, non-running container if the restart manager says so.
 *
 * The decision is made from the container's exit code, its
 * "manually stopped" flag and the seconds elapsed since it was started.
 * When a restart is due, the restart counter is incremented and the restart
 * is scheduled in a separate thread with a fixed delay of 5 seconds.
 *
 * NOTE(review): the timeout filled in by restart_manager_should_restart()
 * is not used; the delay is always 5 seconds - confirm this is intended.
 */
static void restored_restart_container(container_t *cont)
{
    uint64_t timeout = 0;
    char *id = cont->common_config->id;
    char *started_at = state_get_started_at(cont->state);
    bool should_restart = false;

    should_restart = restart_manager_should_restart(id, state_get_exitcode(cont->state),
                                                    cont->common_config->has_been_manually_stopped,
                                                    time_seconds_since(started_at), &timeout);
    if (should_restart) {
        cont->common_config->restart_count++;
        INFO("Restart container %s after 5 second", id);
        (void)container_restart_in_thread(id, 5ULL * Time_Second, (int)state_get_exitcode(cont->state));
    }

    free(started_at);
}

/*
 * handle restored container
 *
 * Walk over every container in the store after the restore pass and, with
 * the container locked:
 *   - reset its restart manager,
 *   - if it is running: re-attach the supervisor exit monitor (a failure is
 *     only logged) and start its health monitor,
 *   - if it is not running and hostconfig->auto_remove_bak is set: mark it
 *     for removal and delete it,
 *   - otherwise: let the restart policy decide whether to restart it.
 *
 * The container lock is dropped around delete_container() and re-taken
 * afterwards so that the common unlock/unref at the end of the loop body
 * stays balanced.
 *
 * Each container returned by containers_store_list() is unref'ed here, and
 * the list itself is freed.
 */
static void handle_restored_container(void)
{
    int ret = 0;
    size_t i = 0;
    size_t container_num = 0;
    char *id = NULL;
    container_t **conts = NULL;
    container_t *cont = NULL;

    ret = containers_store_list(&conts, &container_num);
    if (ret != 0) {
        ERROR("query all containers info failed");
        return;
    }

    for (i = 0; i < container_num; i++) {
        cont = conts[i];
        container_lock(cont);

        (void)reset_restart_manager(cont, false);

        id = cont->common_config->id;

        if (is_running(cont->state)) {
            if (restore_supervisor(cont)) {
                ERROR("Failed to restore %s supervisor", id);
            }
            init_health_monitor(id);
        } else {
            if (cont->hostconfig != NULL && cont->hostconfig->auto_remove_bak) {
                (void)set_container_to_removal(cont);
                /* delete_container() is called without the container lock held */
                container_unlock(cont);
                (void)delete_container(cont, true);
                container_lock(cont);
            } else {
                restored_restart_container(cont);
            }
        }

        container_unlock(cont);
        container_unref(cont);
    }

    free(conts);
    return;
}

/* scan dir to add store */
static void scan_dir_to_add_store(const char *runtime, const char *rootpath, const char *statepath,
L
LiFeng 已提交
463
                                  const size_t subdir_num, const char **subdir)
O
overweight 已提交
464 465 466 467 468 469 470 471 472 473 474 475 476 477
{
    size_t i = 0;
    container_t *cont = NULL;

    for (i = 0; i < subdir_num; i++) {
        cont = NULL;
        bool aret = false;
        bool index_flag = false;
        cont = container_load(runtime, rootpath, statepath, subdir[i]);
        if (cont == NULL) {
            ERROR("Failed to load subdir:%s", subdir[i]);
            goto error_load;
        }

L
LiFeng 已提交
478
        if (restore_state(cont)) {
O
overweight 已提交
479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
            WARN("Failed to restore container %s state", subdir[i]);
            goto error_load;
        }

        index_flag = name_index_add(cont->common_config->name, cont->common_config->id);
        if (!index_flag) {
            ERROR("Failed add %s into name indexs", subdir[i]);
            goto error_load;
        }
        aret = containers_store_add(cont->common_config->id, cont);
        if (!aret) {
            ERROR("Failed add container %s to store", subdir[i]);
            goto error_load;
        }

        continue;
error_load:
        if (remove_invalid_container(cont, runtime, rootpath, statepath, subdir[i])) {
            ERROR("Failed to delete subdir:%s", subdir[i]);
        }
        container_unref(cont);

        if (index_flag) {
            name_index_remove(subdir[i]);
        }
        continue;
    }
}

/*
 * restore container by runtime
 *
 * Look up the root and state directories configured for @runtime, list the
 * sub-directories of the root directory and restore each of them into the
 * containers store.
 *
 * Returns 0 on success (including when there is nothing to restore) and -1
 * when a directory cannot be determined or listed.
 */
static int restore_container_by_runtime(const char *runtime)
{
    int result = -1;
    size_t entry_count = 0;
    char **entries = NULL;
    char *root_dir = NULL;
    char *state_dir = NULL;

    root_dir = conf_get_routine_rootdir(runtime);
    if (root_dir == NULL) {
        ERROR("Root path is NULL");
        goto cleanup;
    }

    state_dir = conf_get_routine_statedir(runtime);
    if (state_dir == NULL) {
        ERROR("State path is NULL");
        goto cleanup;
    }

    if (util_list_all_subdir(root_dir, &entries) != 0) {
        ERROR("Failed to read %s'subdirectory", root_dir);
        goto cleanup;
    }

    /* from here on the call succeeds, whether or not anything is found */
    result = 0;

    entry_count = util_array_len((const char **)entries);
    if (entry_count != 0) {
        scan_dir_to_add_store(runtime, root_dir, state_dir, entry_count, (const char **)entries);
    }

cleanup:
    free(root_dir);
    free(state_dir);
    util_free_array(entries);
    return result;
}

/*
 * containers restore
 *
 * Entry point of the restore pass: every sub-directory of the engine root
 * path is taken as a runtime name and its containers are restored; a
 * failure for one runtime is logged and does not stop the others.
 * Afterwards all restored containers are post-processed by
 * handle_restored_container().
 *
 * If the engine root path cannot be determined or listed, nothing is
 * restored and the post-processing step is skipped.
 */
void containers_restore(void)
{
    size_t idx = 0;
    size_t runtime_count = 0;
    char **runtimes = NULL;
    char *engines_path = conf_get_engine_rootpath();

    if (engines_path == NULL) {
        ERROR("Failed to get engines path");
    } else if (util_list_all_subdir(engines_path, &runtimes) != 0) {
        ERROR("Failed to list engines");
    } else {
        runtime_count = util_array_len((const char **)runtimes);

        for (idx = 0; idx < runtime_count; idx++) {
            DEBUG("Restore the containers by runtime:%s", runtimes[idx]);
            if (restore_container_by_runtime(runtimes[idx]) != 0) {
                ERROR("Failed to restore containers by runtime:%s", runtimes[idx]);
            }
        }

        handle_restored_container();
    }

    free(engines_path);
    util_free_array(runtimes);
}