1
0
mirror of https://github.com/quay/quay.git synced 2026-01-26 06:21:37 +03:00
Files
quay/data/model/repositoryactioncount.py
Kenny Lee Sin Cheong 5f63b3a7bb chore: drop deprecated tables and remove unused code (PROJQUAY-522) (#2089)
* chore: drop deprecated tables and remove unused code

* isort imports

* migration: check for table existence before drop
2023-08-25 12:17:24 -04:00

200 lines
6.1 KiB
Python

import logging
from collections import namedtuple
from datetime import date, datetime, timedelta
from peewee import JOIN, IntegrityError
from data.database import (
LogEntry,
LogEntry2,
LogEntry3,
Repository,
RepositoryActionCount,
RepositorySearchScore,
db_random_func,
fn,
)
logger = logging.getLogger(__name__)

search_bucket = namedtuple("search_bucket", ["delta", "days", "weight"])

# Buckets used for search scoring.
#
# Each bucket covers the time span reaching back `delta` from today, *excluding the span already
# covered by the previous bucket*; `days` is the length of that remaining span. Once the actions
# over a bucket's span have been collected, they are multiplied by the bucket's `weight`.
#
# The weights were determined via the integral of (2/((x/183)+1)^2)/183 over each bucket's range
# of days. That integral over 0..183 equals 1, so the weights add up to (approximately) 1 and the
# resulting score is well normalized.
SEARCH_BUCKETS = [
    search_bucket(delta=timedelta(days=1), days=1, weight=0.010870),
    search_bucket(delta=timedelta(days=7), days=6, weight=0.062815),
    search_bucket(delta=timedelta(days=31), days=24, weight=0.21604),
    search_bucket(delta=timedelta(days=183), days=152, weight=0.71028),
]

# RepositoryActionCount rows dated earlier than this far back are treated as expired by
# delete_expired_entries.
RAC_RETENTION_PERIOD = timedelta(days=365)
def count_repository_actions(to_count, day):
    """
    Counts the logged actions for a repository on the specified day.

    The LogEntry3, LogEntry2 and LogEntry tables are each queried for rows that belong to
    `to_count` and whose datetime lies in the half-open range [day, day + 1 day). Returns the
    sum of the three counts.
    """
    window_end = day + timedelta(days=1)

    def rows_logged_in(table):
        # Number of rows in `table` for this repository within the one-day window.
        query = table.select().where(
            table.repository == to_count,
            table.datetime >= day,
            table.datetime < window_end,
        )
        return query.count()

    return sum(rows_logged_in(table) for table in (LogEntry3, LogEntry2, LogEntry))
def found_entry_count(day):
    """
    Returns how many RepositoryActionCount rows are stored for the given day.
    """
    recorded_on_day = RepositoryActionCount.date == day
    return RepositoryActionCount.select().where(recorded_on_day).count()
def has_repository_action_count(repository, day):
    """
    Returns True if a RepositoryActionCount row exists for the repository on the given day, and
    False otherwise.
    """
    try:
        RepositoryActionCount.get(repository=repository, date=day)
    except RepositoryActionCount.DoesNotExist:
        return False

    return True
def store_repository_action_count(repository, day, action_count):
    """
    Writes a RepositoryActionCount row holding `action_count` for the repository on the given day.

    Returns True once the row has been created. If the insert raises an IntegrityError (the
    repository already has an entry for that day), the error is logged at debug level and False
    is returned.
    """
    try:
        RepositoryActionCount.create(repository=repository, date=day, count=action_count)
    except IntegrityError:
        logger.debug("Count already written for repository %s", repository.id)
        return False

    return True
def update_repository_score(repo):
    """
    Recomputes the search score of the given repository from its rows in the
    RepositoryActionCount table and stores it in the RepositorySearchScore table.

    For every bucket in SEARCH_BUCKETS, the average stored count over the bucket's date window
    is multiplied by the bucket's weight; the weighted averages are summed, scaled by 100 and
    truncated to an integer.

    Note that count_repository_actions for the repo should be called first. Returns True if the
    score row was updated or created, and False if writing it raised an IntegrityError.
    """
    today = date.today()

    # Retrieve the counts for each bucket and calculate the final score.
    final_score = 0.0
    last_end_timedelta = timedelta(days=0)

    for bucket in SEARCH_BUCKETS:
        # Each bucket's window ends where the previous (more recent) bucket's window started,
        # so no day is counted in two buckets.
        start_date = today - bucket.delta
        end_date = today - last_end_timedelta
        last_end_timedelta = bucket.delta

        query = RepositoryActionCount.select(
            fn.Sum(RepositoryActionCount.count), fn.Count(RepositoryActionCount.id)
        ).where(
            RepositoryActionCount.date >= start_date,
            RepositoryActionCount.date < end_date,
            RepositoryActionCount.repository == repo,
        )

        bucket_tuple = query.tuples()[0]
        logger.debug(
            "Got bucket tuple %s for bucket %s for repository %s", bucket_tuple, bucket, repo.id
        )

        # The sum is None when no rows fall inside the window; such buckets contribute nothing.
        if bucket_tuple[0] is None:
            continue

        bucket_sum = float(bucket_tuple[0])
        bucket_count = int(bucket_tuple[1])
        if not bucket_count:
            continue

        # Average count per stored row in the window, weighted by the bucket's modifier.
        final_score += (bucket_sum / bucket_count) * bucket.weight

    # Update the existing repo search score row or create a new one.
    normalized_score = int(final_score * 100.0)

    # Both the update and the create path record a full timestamp. Previously the create path
    # stored only today's date, which dropped the time of day for newly created rows.
    updated_at = datetime.now()

    try:
        try:
            search_score_row = RepositorySearchScore.get(repository=repo)
        except RepositorySearchScore.DoesNotExist:
            RepositorySearchScore.create(
                repository=repo, score=normalized_score, last_updated=updated_at
            )
            return True

        search_score_row.last_updated = updated_at
        search_score_row.score = normalized_score
        search_score_row.save()
        return True
    except IntegrityError:
        # Raised, for instance, when another writer created the row between get and create.
        logger.debug("RepositorySearchScore row already existed; skipping")
        return False
def missing_counts_query(date):
    """
    Returns a query to find all Repository rows that have no RepositoryActionCount entry for
    the given date.

    The query left-outer-joins Repository against the RepositoryActionCount rows of that date
    and keeps only the repositories for which the joined entry's id is null.
    """
    counts_for_date = (
        RepositoryActionCount.select(RepositoryActionCount.id, RepositoryActionCount.repository)
        .where(RepositoryActionCount.date == date)
        .alias("rac")
    )

    same_repository = Repository.id == counts_for_date.c.repository_id
    no_entry_joined = counts_for_date.c.id >> None

    joined = Repository.select().join(counts_for_date, JOIN.LEFT_OUTER, on=same_repository)
    return joined.where(no_entry_joined)
def delete_expired_entries(repo, limit=50):
    """
    Deletes expired entries from the RepositoryActionCount table for a specific repository.

    An entry is expired when its date is earlier than now minus RAC_RETENTION_PERIOD. At most
    `limit` entries are loaded and deleted per call; an entry whose deletion raises an
    IntegrityError is skipped.

    Returns the number of entries removed.
    """
    cutoff = datetime.today() - RAC_RETENTION_PERIOD

    expired_query = RepositoryActionCount.select().where(
        RepositoryActionCount.repository == repo, RepositoryActionCount.date < cutoff
    )
    expired_rows = list(expired_query.limit(limit))
    if not expired_rows:
        return 0

    assert len(expired_rows) <= limit

    removed = 0
    for row in expired_rows:
        try:
            row.delete_instance(recursive=False)
        except IntegrityError:
            # Leave this row in place and move on to the next one.
            continue
        removed += 1

    return removed