# tools/check_metafile_consistency.py
"""Consistency checker for storage-manager metadata.

Given a storagemanager.cnf file, this script verifies that:

1. every object referenced by a ``.meta`` file exists in the cache or in
   the (fake) cloud directory,
2. every file in the journal directory has a corresponding object, and
3. every file in the cloud and cache directories is referenced by some
   metadata file.

Problems are reported on stdout; the script always exits with status 0
once the checks have run.
"""
import io
import sys
import argparse
import json
from pathlib import Path
import os
import configparser
import re
import traceback


# Directories read from the config file; populated by parseArgs().
cloudPath = None
metaPath = None
journalPath = None
cachePath = None

# Every object key seen in any metadata file; filled by validateMetadata()
# and consumed by verifyNoOrphans().
bigObjectSet = set()

# Matches ${NAME}.  The name is matched up to the first closing brace so that
# several references in one setting are resolved independently.
_ENVVAR_PATTERN = re.compile(r"\$\{([^}]+)\}")


def get_envvar(match):
    """Return the value of the environment variable named by group 1.

    Raises KeyError if the variable is not set.
    """
    return os.environ[match.group(1)]


def resolve_envvars(setting):
    """Return *setting* as a string with every ${NAME} replaced.

    Each ``${NAME}`` is substituted with the value of the environment
    variable NAME.  Raises KeyError if a referenced variable is not set.
    """
    return _ENVVAR_PATTERN.sub(get_envvar, str(setting))


def parseArgs():
    """Parse the command line and load the storage paths from the config.

    Sets the module-level cloudPath, metaPath, journalPath and cachePath.
    Exits through ``parser.error`` if the config cannot be read or any of
    the four paths is not a directory.
    """
    global cloudPath
    global metaPath
    global journalPath
    global cachePath

    parser = argparse.ArgumentParser(description="Verifies that the fake-cloud and cache contain what the metadata files say")
    parser.add_argument("config_file", type=str, help="The storagemanager.cnf file")
    args = parser.parse_args()
    config = configparser.ConfigParser()
    try:
        config.read(args.config_file)
        cloudPath = Path(resolve_envvars(config["LocalStorage"]["path"]))
        metaPath = Path(resolve_envvars(config["ObjectStorage"]["metadata_path"]))
        cachePath = Path(resolve_envvars(config["Cache"]["path"]))
        journalPath = Path(resolve_envvars(config["ObjectStorage"]["journal_path"]))
    except Exception as e:
        # Missing sections/keys and undefined environment variables all end
        # up here; parser.error() prints the message and exits.
        parser.error("Failed to parse the config file. Got '{}'".format(e))

    required_dirs = (cloudPath, metaPath, journalPath, cachePath)
    if not all(d.is_dir() for d in required_dirs):
        parser.error("cloudpath, metapath, journalpath, and cachepath need to be directories.")


def key_breakout(key):
    """Split an object key on '_' into at most four fields.

    validateMetadata() treats field 1 as the object's offset.
    """
    return key.split("_", 3)


def validateMetadata(metafile):
    """Check one metadata file against the cache and cloud directories.

    Records every object key in bigObjectSet, reports keys whose embedded
    offset disagrees with the metadata, and reports objects that exist in
    neither the cache nor the cloud.  Any exception is reported (with a
    traceback) and stops processing of this metadata file only.
    """
    try:
        with open(metafile) as f:
            metadata = json.load(f)

        for obj in metadata["objects"]:
            key = obj["key"]
            bigObjectSet.add(key)
            fields = key_breakout(key)

            # Compare as strings so the check works whether the JSON stores
            # the offset as a string or as a number.
            if fields[1] != str(obj["offset"]):
                print("object {}: in metadata offset is {}, key says {}".format(key, obj["offset"], fields[1]))

            inCache = (cachePath / key).exists()
            inCloud = (cloudPath / key).exists()
            if not inCache and not inCloud:
                print("{} does not exist in cache or the cloud".format(key))
                continue

            # The length field is deliberately not checked against the key or
            # the actual file size.  There are a couple of cases where they
            # legitimately don't match:
            # 1) IOC::truncate() currently doesn't rename the object on
            #    truncate for performance reasons.
            # 2) IOC::write() currently does the same on modifying an
            #    existing object.
            # Validating the length would require parsing the journal file
            # as well.

    except Exception as e:
        print("Failed to parse {}, got {}".format(metafile, e))
        traceback.print_exc()


def walkMetaDir(basepath):
    """Recursively validate every ``.meta`` file under *basepath*.

    Anything that is neither a directory nor a ``.meta`` regular file is
    reported as not being a metadata file.
    """
    for p in basepath.iterdir():
        if p.is_dir():
            walkMetaDir(p)
        elif p.is_file() and p.suffix == ".meta":
            validateMetadata(p)
        else:
            print("{} is not a metadata file".format(p))


# Verifies that everything in journalPath has a corresponding object in cloud/cache
def verifyValidJournalFiles():
    """Report journal files whose object is missing from cache and cloud.

    The object name is the journal file's name with its last suffix removed.
    """
    for p in journalPath.iterdir():
        l_cachePath = cachePath / p.stem
        l_cloudPath = cloudPath / p.stem
        if not l_cachePath.is_file() and not l_cloudPath.is_file():
            print("Journal file {} has no corresponding object in cache or cloud storage".format(p))


def verifyNoOrphans():
    """Report cloud and cache entries not referenced by any metadata file.

    Must run after walkMetaDir() has populated bigObjectSet.
    """
    for path in cloudPath.iterdir():
        if path.name not in bigObjectSet:
            print("{} is in cloud storage but not referenced by any metadata file".format(path.name))

    for path in cachePath.iterdir():
        if path.name not in bigObjectSet:
            print("{} is in the cache but not referenced by any metadata file".format(path.name))


def main():
    """Run all consistency checks, then exit with status 0."""
    parseArgs()

    print("Verifying that all objects in metadata exist in cloud storage or the cache")
    walkMetaDir(metaPath)
    print("Verifying that all journal files have a corresponding object")
    verifyValidJournalFiles()
    print("Verifying that all objects in cloud & cache are referenced by metadata")
    verifyNoOrphans()
    print("Done")
    sys.exit(0)


if __name__ == "__main__":
    main()