import signal
from datetime import datetime, timedelta

import pytest
from freezegun import freeze_time

from dagster.core.instance import DagsterInstance
from dagster.core.scheduler import ScheduleTickStatus
from dagster.core.storage.pipeline_run import PipelineRunStatus
from dagster.core.storage.tags import PARTITION_NAME_TAG, SCHEDULED_EXECUTION_TIME_TAG
from dagster.scheduler.scheduler import launch_scheduled_runs
from dagster.seven import get_current_datetime_in_utc, get_utc_timezone, multiprocessing

from .test_scheduler_run import (
    cli_api_repo,
    grpc_repo,
    instance_with_schedules,
    validate_run_started,
    validate_tick,
    wait_for_all_runs_to_start,
)


def _test_launch_scheduled_runs_in_subprocess(instance_ref, execution_datetime, debug_crash_flags):
    with DagsterInstance.from_ref(instance_ref) as instance:
        with freeze_time(execution_datetime):
            launch_scheduled_runs(instance, get_current_datetime_in_utc(), debug_crash_flags)


@pytest.mark.parametrize("external_repo_context", [cli_api_repo, grpc_repo])
@pytest.mark.parametrize("crash_location", ["TICK_CREATED", "TICK_HELD"])
@pytest.mark.parametrize("crash_signal", [signal.SIGKILL, signal.SIGINT])
def test_failure_recovery_before_run_created(external_repo_context, crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted before a run is created,
    # it will create exactly one tick/run when it is re-launched
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = datetime(
            year=2019, month=2, day=27, hour=0, minute=0, second=0, tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule("simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.STARTED

            assert instance.get_runs_count() == 0

            frozen_datetime.tick(delta=timedelta(minutes=5))

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], initial_datetime, "2019-02-26")

            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )


@pytest.mark.parametrize("external_repo_context", [cli_api_repo, grpc_repo])
@pytest.mark.parametrize("crash_location", ["RUN_CREATED", "RUN_LAUNCHED"])
@pytest.mark.parametrize("crash_signal", [signal.SIGKILL, signal.SIGINT])
def test_failure_recovery_after_run_created(external_repo_context, crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = datetime(
            year=2019, month=2, day=27, hour=0, minute=0, second=0, tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule("simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1
            assert ticks[0].status == ScheduleTickStatus.STARTED

            assert instance.get_runs_count() == 1

            if crash_location == "RUN_CREATED":
                run = instance.get_runs()[0]
                # Run was created, but hasn't launched yet
                assert run.tags[SCHEDULED_EXECUTION_TIME_TAG] == initial_datetime.isoformat()
                assert run.tags[PARTITION_NAME_TAG] == "2019-02-26"
                assert run.status == PipelineRunStatus.NOT_STARTED
            else:
                # The run was created and launched - running again should do nothing other than
                # moving the tick to success state.

                # The fact that we need to add this line indicates that there is still a theoretical
                # possible race condition - if the scheduler fails after launching a run
                # and then runs again between when the run was launched and when its status is changed to STARTED by the executor, we could
                # end up launching the same run twice. Run queueing or some other way to immediately
                # identify that a run was launched would help eliminate this race condition. For now,
                # eliminate the possibility by waiting for the run to start before running the
                # scheduler again.
                wait_for_all_runs_to_start(instance)

                run = instance.get_runs()[0]
                validate_run_started(instance.get_runs()[0], initial_datetime, "2019-02-26")

                assert run.status in [PipelineRunStatus.STARTED, PipelineRunStatus.SUCCESS]

            frozen_datetime.tick(delta=timedelta(minutes=5))

            # Running again just launches the existing run and marks the tick as success
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            wait_for_all_runs_to_start(instance)
            validate_run_started(instance.get_runs()[0], initial_datetime, "2019-02-26")

            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )


@pytest.mark.parametrize("external_repo_context", [cli_api_repo, grpc_repo])
@pytest.mark.parametrize("crash_location", ["TICK_SUCCESS"])
@pytest.mark.parametrize("crash_signal", [signal.SIGKILL, signal.SIGINT])
def test_failure_recovery_after_tick_success(external_repo_context, crash_location, crash_signal):
    # Verify that if the scheduler crashes or is interrupted after a run is created,
    # it will just re-launch the already-created run when it runs again
    with instance_with_schedules(external_repo_context) as (instance, external_repo):
        initial_datetime = datetime(
            year=2019, month=2, day=27, hour=0, minute=0, second=0, tzinfo=get_utc_timezone(),
        )
        external_schedule = external_repo.get_external_schedule("simple_schedule")
        with freeze_time(initial_datetime) as frozen_datetime:
            instance.start_schedule_and_update_storage_state(external_schedule)

            debug_crash_flags = {external_schedule.name: {crash_location: crash_signal}}

            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), debug_crash_flags],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)

            assert scheduler_process.exitcode != 0

            # As above there's a possible race condition here if the scheduler crashes
            # and launches the same run twice if we crash right after the launch and re-run
            # before the run actually starts
            wait_for_all_runs_to_start(instance)

            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime, "2019-02-26")

            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1

            if crash_signal == signal.SIGKILL:
                validate_tick(
                    ticks[0], external_schedule, initial_datetime, ScheduleTickStatus.STARTED, None,
                )
            else:
                validate_tick(
                    ticks[0],
                    external_schedule,
                    initial_datetime,
                    ScheduleTickStatus.SUCCESS,
                    instance.get_runs()[0].run_id,
                )

            frozen_datetime.tick(delta=timedelta(minutes=5))

            # Running again just marks the tick as success since the run has already started
            scheduler_process = multiprocessing.Process(
                target=_test_launch_scheduled_runs_in_subprocess,
                args=[instance.get_ref(), get_current_datetime_in_utc(), None],
            )
            scheduler_process.start()
            scheduler_process.join(timeout=60)
            assert scheduler_process.exitcode == 0

            assert instance.get_runs_count() == 1
            validate_run_started(instance.get_runs()[0], initial_datetime, "2019-02-26")

            ticks = instance.get_schedule_ticks(external_schedule.get_origin_id())
            assert len(ticks) == 1
            validate_tick(
                ticks[0],
                external_schedule,
                initial_datetime,
                ScheduleTickStatus.SUCCESS,
                instance.get_runs()[0].run_id,
            )
