#!/usr/bin/env python

# This daemon runs on the CA side to look for requests in
# the database that are waiting for actions to be taken:
# generating challenges, testing whether challenges have
# been met, and issuing certs when the challenges have been
# met.  The daemon does not communicate with the client at
# all; it just notes changes to request state in the database,
# which the server will inform the client about when the
# client subsequently checks in.

# The queue mechanism with pending-* is supposed to control
# concurrency issues properly, but this needs verification
# to ensure that there are no possible race conditions.
# Generally, the server process (as distinct from the daemon)
# is not supposed to change sessions at all once they have
# been added to a queue, except for marking them no longer
# live if the server realizes that something bad has happened
# to them.  There may be some exceptions, and they should all
# be analyzed for possible races.

# TODO: check sessions' internal evidence for consistency
# with their queue membership (in case of crashes or bugs).
# In particular, check that a session in pending-makechallenge
# does not actually contain any challenges and that a
# session in pending-issue does not actually contain an
# issued cert.
# TODO: write a queue-rebuilding script that uses sessions'
# internal state to decide which queue they go in (to be
# run when starting the daemon, in case a crash left a
# session out of every pending queue because the daemon
# was actively working on it at the time); consider marking
# sessions "dirty" when beginning to actually modify their
# contents in order to allow dirty sessions to be deleted
# after a crash instead of placing them back on a queue.
# Or, we could just decide that a crash invalidates each
# and every pending request, period, while still allowing
# clients to look up successfully issued certs.
# TODO: implement multithreading to allow several parallel
# worker processes.
#
# NOTE: The daemon enforces its own timeouts, which are
# defined in the ancient() function.  These timeouts apply
# to any session that has been placed in a queue and can
# be completely independent of the session timeout policy
# in the server.  Being marked as dead at any time by either
# the server or the daemon (due to timeout or error) causes
# a session to be treated as dead by both.
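
# Session lifecycle, as implemented below:
#     makechallenge -> testchallenge -> issue -> done
# Each state has a corresponding pending-* queue, and transitions are
# announced on the "requests" pubsub channel (except entry into done
# state; see the TODO in issue() below).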

import redis, redis_lock, time, CSR, sys, signal, binascii
from sni_challenge.verify import verify_challenge
from Crypto import Random

r = redis.Redis()
ps = r.pubsub()
issue_lock = redis_lock.redis_lock(r, "issue_lock")
# This lock guards the ability to issue certificates with "openssl ca",
# which has no locking of its own.  We don't need locking for the updates
# that the daemon performs on the sessions in the database because the
# queues pending-makechallenge, pending-testchallenge, and pending-issue
# are updated atomically and the daemon only ever acts on sessions that it
# has removed from a queue.
# TODO: in a deployed system, the queue for issuing certs should probably
# be treated in a first-come, first-issue fashion, so that a request doesn't
# time out while waiting to acquire the lock just because other requests
# happened to get it first.  Another way of putting this is that there
# could be only one thread/process that deals with pending-issue sessions,
# even though there could be many that deal with pending-makechallenge and
# pending-testchallenge.  Then we can guarantee that the oldest pending-issue
# requests are dealt with first, which is impossible to guarantee when
# multiple daemons may be opportunistically acquiring this lock.
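
# A sketch of the single-issuer idea from the TODO above (hypothetical;
# not used by this daemon): one dedicated worker drains pending-issue
# in FIFO order while other workers handle the other queues, so no
# issue request starves while waiting on issue_lock:
#
#     while True:
#         session = r.brpop("pending-issue")[1]
#         issue(session)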

debug = "debug" in sys.argv
clean_shutdown = False

def signal_handler(signum, frame):
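    """Request a clean shutdown: set the flag that the main loop checks
    and publish on the "exit" channel so a blocked ps.listen() wakes up."""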
    global clean_shutdown
    clean_shutdown = True
    r.publish("exit", "clean-exit")

signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

def short(session):
    """Return the first 12 bytes of a session ID, or, for a
    challenge ID, the challenge ID with the session ID truncated."""
    tmp = session.partition(":")
    return tmp[0][:12] + "..." + tmp[1] + tmp[2]

def ancient(session, state):
    """Given that this session is in the specified named state,
    decide whether the daemon should forcibly expire it for being too
    old, even if no client request has caused the server to mark the
    session as expired.  This is most relevant to truly abandoned
    sessions that no client ever asks about."""
    age = int(time.time()) - int(r.hget(session, "created"))
    if state == "makechallenge" and age > 120:
        if debug: print "considered", short(session), "ancient"
        return True
    if state == "testchallenge" and age > 600:
        if debug: print "considered", short(session), "ancient"
        return True
    return False

def random():
    """Return 64 hex digits representing a new 32-byte random number."""
    return binascii.hexlify(Random.get_random_bytes(32))

def random_raw():
    """Return 32 random bytes."""
    return Random.get_random_bytes(32)

def makechallenge(session):
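    """Create a challenge for each name in this session and advance the
    session from makechallenge to testchallenge state, unless the
    session is no longer live."""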
    if r.hget(session, "live") != "True":
        # This session has died due to some other reason, like an
        # illegal request or timeout, since it entered makechallenge
        # state.  Consequently, we're not allowed to advance its
        # state any further, and it should be removed from the
        # pending-requests queue and not pushed into any other queue.
        # We don't have to remove it from pending-makechallenge
        # because the caller has already done so.
        if debug: print "removing expired session", short(session)
        r.lrem("pending-requests", session)
        return
    # Currently only makes challenges of type 0 (DomainValidateSNI).
    # This challenge type has three internal data parameters:
    #     dvsni:nonce, dvsni:r, dvsni:ext
    # This challenge type sends three data parameters to the client:
    #     nonce, y = E(r), ext
    #
    # Make one challenge for each name.  (This one-to-one relationship
    # is not an inherent protocol requirement!)
    names = r.lrange("%s:names" % session, 0, -1)
    if debug: print "%s: new valid request" % session
    if debug: print "%s: from requesting client at %s" % (short(session), r.hget(session, "client-addr"))
    if debug: print "%s: for %d names: %s" % (short(session), len(names), ", ".join(names))
    for i, name in enumerate(names):
        challenge = "%s:%d" % (session, i)
        r.hset(challenge, "challtime", int(time.time()))
        r.hset(challenge, "type", 0)   # DomainValidateSNI
        r.hset(challenge, "name", name)
        r.hset(challenge, "satisfied", False)
        r.hset(challenge, "failed", False)
        r.hset(challenge, "dvsni:nonce", random())
        r.hset(challenge, "dvsni:r", random_raw())
        r.hset(challenge, "dvsni:ext", "1.3.3.7")
        # Keep accurate count of how many challenges exist in this session.
        r.hincrby(session, "challenges", 1)
        if debug: print "created new challenge", short(challenge)
    if True:   # challenges have been created
        r.hset(session, "state", "testchallenge")
        r.lpush("pending-testchallenge", session)
        # TODO: this causes the daemon to immediately attempt to test the
        # challenge for completion, with no delay.
        r.publish("requests", "testchallenge")
    else:
        r.lpush("pending-makechallenge", session)
        r.publish("requests", "makechallenge")

def testchallenge(session):
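    """Probe any outstanding challenges for this session.  If every
    challenge has been satisfied, advance the session to issue state;
    otherwise, requeue it for another test (unless the session is no
    longer live)."""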
    if r.hget(session, "live") != "True":
        # This session has died due to some other reason, like an
        # illegal request or timeout, since it entered testchallenge
        # state.  Consequently, we're not allowed to advance its
        # state any further, and it should be removed from the
        # pending-requests queue and not pushed into any other queue.
        # We don't have to remove it from pending-testchallenge
        # because the caller has already done so.
        if debug: print "removing expired session", short(session)
        r.lrem("pending-requests", session)
        return
    # Note that we can push this back into the original queue.
    # TODO: need to add a way to make sure we don't test the same
    # session too often.  Conceivably, this could wait until the
    # client announces that it has completed the challenges.
    # Information about the client's reporting could be stored in
    # the database.  Then the CA doesn't need to poll prematurely.
    all_satisfied = True
    for i, name in enumerate(r.lrange("%s:names" % session, 0, -1)):
        challenge = "%s:%d" % (session, i)
        if debug: print "testing challenge", short(challenge)
        challtime = int(r.hget(challenge, "challtime"))
        challtype = int(r.hget(challenge, "type"))
        name = r.hget(challenge, "name")
        satisfied = r.hget(challenge, "satisfied") == "True"
        failed = r.hget(challenge, "failed") == "True"
        # TODO: check whether this challenge is too old
        if not satisfied and not failed:
            # if debug: print "challenge", short(challenge), "being tested"
            if challtype == 0:   # DomainValidateSNI
                if debug: print "\tbeginning dvsni test to %s" % name
                dvsni_nonce = r.hget(challenge, "dvsni:nonce")
                dvsni_r = r.hget(challenge, "dvsni:r")
                dvsni_ext = r.hget(challenge, "dvsni:ext")
                direct_result, direct_reason = verify_challenge(name, dvsni_r, dvsni_nonce, False)
                proxy_result, proxy_reason = verify_challenge(name, dvsni_r, dvsni_nonce, True)
                if debug:
                    print "\t...direct probe: %s (%s)" % (direct_result, direct_reason)
                    print "\tTor proxy probe: %s (%s)" % (proxy_result, proxy_reason)
                if direct_result and proxy_result:
                    r.hset(challenge, "satisfied", True)
                else:
                    all_satisfied = False
                    # TODO: distinguish permanent and temporary failures;
                    # a failure can be treated as permanent under some
                    # conditions, causing the session to become dead.
                    # TODO: need to articulate what those conditions are.
            else:
                # Don't know how to handle this challenge type
                all_satisfied = False
        elif not satisfied:
            if debug: print "\tchallenge was not attempted"
            all_satisfied = False
    if all_satisfied:
        # Challenges all succeeded, so we should prepare to issue
        # the requested cert.
        # TODO: double-check that there were > 0 challenges,
        # so that we don't somehow mistakenly issue a cert in
        # response to an empty list of challenges (even though
        # the daemon that put this session on the queue should
        # also have implicitly guaranteed this).
        if debug: print "\t** All challenges satisfied; request %s GRANTED" % short(session)
        r.hset(session, "state", "issue")
        r.lpush("pending-issue", session)
        r.publish("requests", "issue")
    else:
        # Some challenges are not verified.
        # Put this session back on the queue to try to verify again.
        r.lpush("pending-testchallenge", session)
        # TODO: if we wanted the client to tell us when it believes
        # it has completed the challenges, we should take this out and
        # have the server publish the message in response to the message
        # from the client.  Also, the current version will cause the
        # server to retest over and over again as fast as it's able.
        r.publish("requests", "testchallenge")

def issue(session):
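    """Issue the certificate requested by this session, which has
    satisfied all of its challenges, and advance it to done state
    (unless the session is no longer live)."""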
    if r.hget(session, "live") != "True":
        # This session has died due to some other reason, like an
        # illegal request or timeout, since it entered issue
        # state.  Consequently, we're not allowed to advance its
        # state any further, and it should be removed from the
        # pending-requests queue and not pushed into any other queue.
        # We don't have to remove it from pending-issue
        # because the caller has already done so.
        #
        # Having a session in pending-issue die is a very weird case
        # that probably suggests that timeouts are set incorrectly
        # or that the client is misbehaving very badly.  This means
        # that a request passed all of its challenges but the
        # session nonetheless died for some reason unrelated to failing
        # challenges before the cert could be issued.  Normally, this
        # should never happen.
        #
        # TODO: This can definitely happen when there are extremely many
        # sessions stuck in testchallenge state compared to the number of
        # daemon processes available to handle them, because each session
        # in testchallenge gets tested once before any daemon gets around
        # to issuing the cert.  This is a bug.
        if debug: print "removing expired (issue-state!?) session", short(session)
        r.lrem("pending-requests", session)
        return
    csr = r.hget(session, "csr")
    names = r.lrange("%s:names" % session, 0, -1)
    with issue_lock:
        cert = CSR.issue(csr, names)
    r.hset(session, "cert", cert)
    if cert:   # once issuing cert succeeded
        if debug: print "%s: issued certificate for names: %s" % (short(session), ", ".join(names))
        r.hset(session, "state", "done")
        r.lpush("pending-done", session)
        # TODO: Note that we do not publish a pubsub message when
        # the session enters done state, so the daemon will not
        # actually act on it.  Is that OK?
    else:   # should not be reached in deployed version
        if debug: print "issuing for", short(session), "failed"
        r.lpush("pending-issue", session)
        r.publish("requests", "issue")

# Dispatch table for how to react to pubsub messages.  The key is
# the pubsub message and the value is a tuple of (queue name, function):
# look in the specified queue for a pending session and, if one is
# found, call the specified function on it.  (The main loop below
# currently hardcodes the same queue names and handlers rather than
# consulting this table.)  Since the queue names are systematically
# related to the message names, we could probably remove the queue
# name field entirely.
dispatch = { "makechallenge": ("pending-makechallenge", makechallenge),
             "testchallenge": ("pending-testchallenge", testchallenge),
             "issue": ("pending-issue", issue),
             "done": ("pending-done", lambda x: None) }

# Main loop: act on queues notified via Redis pubsub mechanism.
# Currently, we ignore the specific details of which queue was
# notified and, upon any notification, repeatedly process a single
# item from each queue until all queues are empty.

ps.subscribe(["requests"])
ps.subscribe(["logs"])
ps.subscribe(["exit"])

for message in ps.listen():
    if message["type"] != "message":
        continue
    if message["channel"] == "logs":
        if debug: print message["data"]
        continue
    if message["channel"] == "exit":
        break
    if message["channel"] == "requests":
        # populated_queue would be used by a more sophisticated scheduler
        populated_queue = message["data"]
        while True:
            inactive = True
            for queue in ("makechallenge", "testchallenge", "issue"):
                if clean_shutdown:
                    inactive = True
                    break
                session = r.rpop("pending-" + queue)
                if session:
                    inactive = False
                    if ancient(session, queue) and queue != "issue":
                        if debug: print "expiring ancient session", short(session)
                        r.hset(session, "live", False)
                    else:
                        # if debug: print "going to %s for %s" % (queue, short(session))
                        if queue == "makechallenge": makechallenge(session)
                        elif queue == "testchallenge": testchallenge(session)
                        elif queue == "issue": issue(session)
            if inactive:
                break
    if clean_shutdown:
        print "daemon exiting cleanly"
        break