excepthandler.py 9.31 KB
Newer Older
1
# -*- coding: utf-8 -*-
2
3
4
5
6
"""
This module defines a context in which we can run actions that are likely to
fail because they have intricate dependencies e.g. network connections,
file access, parsing certificates and validating their chains, etc., without
stopping execution of the application. Additionally it will log these errors
7
and depending on the nature of the error reschedule the task at a time that
8
9
seems reasonable, i.e.: we can reasonably expect the issue to be resolved by
that time.
10
11
12
13
14
15
16
17

It is generally considered bad practice to catch all remaining exceptions,
however this is a daemon. We can't afford it to get stuck or crashed. So in the
interest of staying alive, if an exception is not caught specifically, the
handler will catch it, generate a stack trace and save if in a file in the
current working directory. A log entry will be created explaining that there
was an exception, inform about the location of the stack trace dump and that
the context will be dropped. It will also kindly request the administrator to
Chris Snijder's avatar
Chris Snijder committed
18
contact the developers so the exception can be caught in a future release which
19
will probably increase stability and might result in a retry rather than just
Chris Snijder's avatar
Chris Snijder committed
20
dropping the context.
21
22
23
24
25
26
27
28
29
30
31

Dropping the context effectively means that a retry won't occur and since the
context will have no more references, it will be garbage collected.
There is however still a reference to the certificate model in
:attr:`core.daemon.run.models`. With no scheduled actions it will
just sit idle, until the finder detects that it is either removed – which will
cause the entry in :attr:`core.daemon.run.models` to be deleted, or
it is changed. If the certificate file is changed the finder will schedule
schedule a parsing action for it and it will be picked up again. Hopefully the
issue that caused the uncaught exception will be resolved, if not, if will be
caught again and the cycle continues.
32
33
34
35
"""

from contextlib import contextmanager
import datetime
36
37
import logging
import os
38
import errno
39
import traceback
40
41
42
43
44
45
46
from stapled.core.exceptions import OCSPBadResponse
from stapled.core.exceptions import RenewalRequirementMissing
from stapled.core.exceptions import CertFileAccessError
from stapled.core.exceptions import CertParsingError
from stapled.core.exceptions import CertValidationError
from stapled.core.exceptions import StapleAdderBadResponse
from stapled.core.exceptions import SocketError
47
from urllib.error import URLError
48

49
LOG = logging.getLogger(__name__)
Maarten de Waard's avatar
Maarten de Waard committed
50

51
#: This is a global variable that is overridden by stapled.__main__ with
52
#: the command line argument: ``--logdir``
53
LOG_DIR = "/var/log/stapled/"
Maarten de Waard's avatar
Maarten de Waard committed
54

55
STACK_TRACE_FILENAME = "stapled_exception{:%Y%m%d-%H%M%s%f}.trace"
56
57
58


@contextmanager
59
def stapled_except_handle(ctx=None):
60
61
62
    """
    Handle lots of potential errors and reschedule failed action contexts.
    """
Chris Snijder's avatar
Chris Snijder committed
63
    # pylint: disable=too-many-branches,too-many-statements
64
    try:
65
        yield  # do the "with stapled_except_handle(ctx):" code block
66
    except (CertFileAccessError, StapleAdderBadResponse) as exc:
Chris Snijder's avatar
Chris Snijder committed
67
68
        # Can't access the certificate file or the response from HAPRoxy was
        # not "OCSP Response updated", we can try again a bit later..
69
        err_count = ctx.set_last_exception(str(exc))
70
        if err_count < 4:
71
            LOG.error(exc)
72
            ctx.reschedule(60 * err_count)  # every err_count minutes
73
        elif err_count < 7:
74
75
            LOG.error(exc)
            ctx.reschedule(3600)  # every hour
76
        else:
77
            LOG.critical("%s, giving up..", exc)
Chris Snijder's avatar
Chris Snijder committed
78
    except (SocketError, BrokenPipeError) as exc:
79
        # This is a fatal exception that can occur during initialisation of a
80
        # StapleAdder or when an StapleAdder uses a socket that consistently has a
81
        # broken pipe
82
        LOG.critical(exc)
83
84
85
    except (RenewalRequirementMissing,
            CertValidationError,
            CertParsingError) as exc:
86
        # Can't parse or validate the certificate file, or a requirement for
87
88
        # OCSP renewal is missing.
        # We can't do anything until the certificate file is changed which
Chris Snijder's avatar
Typo    
Chris Snijder committed
89
        # means we should not reschedule, when the certificate file changes,
90
        # the certfinder will add it to the parsing queue anyway..
91
92
93
94
95
        if isinstance(exc, CertValidationError):
            # If the certificate validation failed, we probably better not
            # serve the staple because it may make the server unavailable,
            # while not serving it only makes things slightly slower.
            delete_ocsp_for_context(ctx)
96
97
98
99
100
101
102
        if isinstance(exc, CertParsingError):
            if exc.log_level:
                LOG.log(exc.log_level, exc)
            else:
                LOG.critical(exc)
        else:
            LOG.critical(exc)
103
    except OCSPBadResponse as exc:
104
105
106
107
108
109
110
111
112
113
114
115
        # The OCSP response is empty, invalid or the status is not "good", we
        # can try again, maybe there's server side problem.
        err_count = ctx.set_last_exception(str(exc))
        if err_count < 4:
            LOG.error(exc)
            ctx.reschedule(60 * err_count)  # every err_count minutes
        elif err_count < 7:
            LOG.error(exc)
            ctx.reschedule(3600)  # every hour
        else:
            LOG.critical(exc)
            ctx.reschedule(43200)  # twice a day
116
117
118
119
120
121
    except (URLError) as exc:
        LOG.error(
            "Can't open URL: %s, reason: %s",
            ctx.model.ocsp_urls[ctx.model.url_index],
            exc.reason
        )
122
123
124
125
126
127

        # Iterate over the available OCSP URLs while rescheduling
        len_ocsp_urls = len(ctx.model.ocsp_urls)
        ctx.model.url_index += 1
        if ctx.model.url_index >= len_ocsp_urls:
            ctx.model.url_index = 0
128

129
130
131
132
133
134
        # Reschedule every 10 seconds (3x), then every hour (3x),
        # then twice a day, *per* URL so if we have 3 urls we will run:
        #  - every 10 seconds (9x), 3 per url
        #  - every hour (9x), 3 per url
        #  - twice a day per url
        err_count = ctx.set_last_exception(str(exc))
135
        if err_count < (3 * len_ocsp_urls) + 1:
136
            ctx.reschedule(10)  # every err_count minutes
137
        elif err_count < (6 * len_ocsp_urls) + 1:
138
139
140
            ctx.reschedule(3600)  # every hour
        else:
            ctx.reschedule(43200 // len_ocsp_urls)  # twice a day per url
141
142
143
144
145
        LOG.debug(
            "This exception %d in a row, context.model.ocsp_urls has %d "
            "entries",
            err_count, len_ocsp_urls
        )
146
    except (IOError, OSError) as exc:
147
148
        LOG.critical(handle_file_error(exc))

149

Chris Snijder's avatar
Chris Snijder committed
150
151
    # the show must go on..
    except Exception as exc:  # pylint: disable=broad-except
152
153
        dump_stack_trace(ctx, exc)

154
def handle_file_error(exc):
155
156
157
158
159
160
161
    """
    Wrapper for handling IOError and OSError logging..

    Can't use FileNotFoundError and PermissionError because they don't exist in
    Python 2.7.x yet. This won't be required after we remove Python 2.7.x
    support.
    :param Exception exc: OSError or IOError to handle logging for.
162
    :return str: Reason for OSError/IOError.
163
164
    """
    if exc.errno == errno.EPERM or exc.errno == errno.EACCES:
165
        reason = "Permission error"
166
    elif exc.errno == errno.ENOENT:
167
168
169
170
171
        reason = "File not found error"
    elif isinstance(exc, IOError):
        reason = "I/O Error"
    else:
        reason = "OS Error"
172
    return "{}: {}".format(reason, str(exc))
173

174

175
def delete_ocsp_for_context(ctx):
176
177
178
179
    """
    When something bad happens, sometimes it is good to delete a related bad
    OCSP file so it can't be served any more.

180
181
    .. TODO:: Check that HAProxy doesn't cache this, it probably does, we need
        to be able to tell it not to remember it.
182
    """
183
    LOG.info("Zero-ing any OCSP staple: \"%s.ocsp\" if it exists.", ctx.model)
184
    try:
185
        ocsp_file = "{}.ocsp".format(ctx.model.filename)
186
187
        with open(ocsp_file, 'w') as ocsp_file_obj:
            ocsp_file_obj.write("")
188
    except (IOError, OSError) as exc:
189
        LOG.debug(
190
191
192
            "Can't replace OCSP staple \"%s\" by an empty stub, reason: %s",
            ocsp_file,
            handle_file_error(exc)
193
        )
194

195
196
197
198
199
200
201
202

def dump_stack_trace(ctx, exc):
    """
    Examine the last exception and dump a stack trace to a file, if it fails
    due to an IOError or OSError, log that it failed so the a sysadmin
    may make the directory writeable.
    """
    trace_file = STACK_TRACE_FILENAME.format(datetime.datetime.now())
Maarten de Waard's avatar
Maarten de Waard committed
203
    trace_file = os.path.join(LOG_DIR, trace_file)
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
    try:
        with open(trace_file, "w") as file_handle:
            traceback.print_exc(file=file_handle)
        LOG.critical(
            "Prevented thread from being killed by uncaught exception: %s\n"
            "Context %s will be dropped as a result of the exception.\n"
            "A stack trace has been saved in %s\n"
            "Please report this error to the developers so the exception can "
            "be handled in a future release, thank you!",
            exc,
            ctx,
            trace_file
        )
    except (IOError, OSError) as trace_exc:
        LOG.critical(
            "Prevented thread from being killed by uncaught exception: %s\n"
            "Context %s will be dropped as a result of the exception.\n"
            "Couldn't dump stack trace to: %s reason: %s\n"
            "Please report this error to the developers so the exception can "
            "be handled in a future release, thank you!",
            exc,
            ctx,
            trace_file,
            trace_exc
        )