From 8701577cff4f080bbe61de24a082f5ef2d2abd33 Mon Sep 17 00:00:00 2001
From: Joseph Schorr
Date: Wed, 18 Mar 2020 17:00:14 -0400
Subject: [PATCH] Have the RepositoryActionCount worker cleanup old rows in RAC (#274)

We don't make use of any action counts older than a year, so this change will have the worker remove old rows, one month (roughly) at a time
---
Review note: delete_expired_entries() now applies .limit(limit) to its query so
that each call removes one bounded batch, which is what the worker loop expects.
The diffstat and the hunk header were updated for the new line count; the blob
hashes on the "index" line still refer to the original version of this change.

 data/model/repositoryactioncount.py | 35 +++++++++++++++++++++++++++++++++++
 workers/repositoryactioncounter.py  | 16 ++++++++++++++--
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/data/model/repositoryactioncount.py b/data/model/repositoryactioncount.py
index 9cc715f25..9170a1c5a 100644
--- a/data/model/repositoryactioncount.py
+++ b/data/model/repositoryactioncount.py
@@ -31,6 +31,8 @@ SEARCH_BUCKETS = [
     search_bucket(timedelta(days=183), 152, 0.71028),
 ]
 
+RAC_RETENTION_PERIOD = timedelta(days=365)
+
 
 def find_uncounted_repository():
     """
@@ -157,3 +159,36 @@ def update_repository_score(repo):
     except IntegrityError:
         logger.debug("RepositorySearchScore row already existed; skipping")
         return False
+
+
+def delete_expired_entries(repo, limit=100):
+    """
+    Deletes expired entries from the RepositoryActionCount table for a specific repository.
+
+    An entry is expired when its date is older than RAC_RETENTION_PERIOD. At most `limit`
+    entries are selected per call, so callers can invoke this repeatedly to clean up in
+    small batches until it returns 0.
+
+    Returns the number of entries removed.
+    """
+    threshold_date = datetime.today() - RAC_RETENTION_PERIOD
+    found = list(
+        RepositoryActionCount.select()
+        .where(
+            RepositoryActionCount.repository == repo,
+            RepositoryActionCount.date < threshold_date,
+        )
+        .limit(limit)
+    )
+
+    if not found:
+        return 0
+
+    count_removed = 0
+    for entry in found:
+        try:
+            entry.delete_instance(recursive=False)
+            count_removed += 1
+        except IntegrityError:
+            continue
+
+    return count_removed
diff --git a/workers/repositoryactioncounter.py b/workers/repositoryactioncounter.py
index b825145de..39b5cbb56 100644
--- a/workers/repositoryactioncounter.py
+++ b/workers/repositoryactioncounter.py
@@ -24,19 +24,22 @@ class RepositoryActionCountWorker(Worker):
         """
         Counts actions and aggregates search scores for a random repository for the previous day.
         """
+        # Select a repository that needs its actions for the last day updated.
         to_count = model.repositoryactioncount.find_uncounted_repository()
         if to_count is None:
             logger.debug("No further repositories to count")
             return False
 
-        yesterday = date.today() - timedelta(days=1)
-
         logger.debug("Found repository #%s to count", to_count.id)
+
+        # Count the number of actions that occurred yesterday for the repository.
+        yesterday = date.today() - timedelta(days=1)
         daily_count = logs_model.count_repository_actions(to_count, yesterday)
         if daily_count is None:
             logger.debug("Could not load count for repository #%s", to_count.id)
             return False
 
+        # Store the count for the repository.
         was_counted = model.repositoryactioncount.store_repository_action_count(
             to_count, yesterday, daily_count
         )
@@ -44,6 +47,7 @@ class RepositoryActionCountWorker(Worker):
             logger.debug("Repository #%s was counted by another worker", to_count.id)
             return False
 
+        # Update the search score for the repository now that its actions have been counted.
         logger.debug("Updating search score for repository #%s", to_count.id)
         was_updated = model.repositoryactioncount.update_repository_score(to_count)
         if not was_updated:
@@ -53,6 +57,14 @@ class RepositoryActionCountWorker(Worker):
             return False
 
         logger.debug("Repository #%s search score updated", to_count.id)
+
+        # Delete any entries older than the retention period for the repository.
+        while True:
+            found = model.repositoryactioncount.delete_expired_entries(to_count, 30)
+            if found <= 0:
+                break
+
+        logger.debug("Repository #%s old entries removed", to_count.id)
         return True
 
 