#!/usr/bin/python3

import sys
import hashlib
import os
import sqlite3
import shutil
import json

################################################################################
# program wide constants
################################################################################

GENERAL_ERROR=1
USAGE_ERROR=2
PATH_ERROR=3

################################################################################
# functions
################################################################################

def calculate_file_hash(algorithm, file_path, segment_size=4096):
    """Return the hex digest of the file at *file_path*.

    algorithm    -- any name accepted by hashlib.new (e.g. "sha512")
    file_path    -- path of the file to hash
    segment_size -- number of bytes read per iteration, so large files are
                    never loaded into memory as a whole

    Raises OSError if the file cannot be opened and ValueError if the
    algorithm is unknown to hashlib.
    """
    # The file is opened before the hash object is created so that a missing
    # file is reported in preference to an invalid algorithm name.
    with open(file_path, "rb") as file_descriptor:
        hash_object = hashlib.new(algorithm)
        segment = file_descriptor.read(segment_size)
        while segment:
            hash_object.update(segment)
            segment = file_descriptor.read(segment_size)
    return hash_object.hexdigest()


def calculate_parity(byte_string):
    """Return the XOR of all bytes in *byte_string* as an int (0 if empty)."""
    parity_byte = 0b00000000
    for byte in byte_string:
        parity_byte ^= byte
    return parity_byte


def gegerate_parity_file(input_path, parity_bytes, output_path):
    """Write one XOR parity byte per *parity_bytes*-sized segment of the input.

    input_path   -- file to read
    parity_bytes -- segment length in bytes; the final segment may be shorter
    output_path  -- file that receives the parity bytes (overwritten)

    The (misspelled) function name is kept because callers use it; see the
    generate_parity_file alias below.
    """
    with open(input_path, "rb") as input_file, open(output_path, "wb") as output_file:
        segment = input_file.read(parity_bytes)
        while segment:
            output_file.write(calculate_parity(segment).to_bytes(1, byteorder='big'))
            segment = input_file.read(parity_bytes)


# correctly spelled alias; the original name stays available for existing callers
generate_parity_file = gegerate_parity_file


def create_container(storage_directory, parity=False, parity_bytes=512, checksum_algorithm='sha512', compress=False):
    """Create a new storage directory including its SQLite database.

    Creates *storage_directory* with an "objects" subdirectory (plus a
    "parity" subdirectory when *parity* is true) and a "container.sqlite"
    database holding the settings, hashes and tags tables.

    Raises OSError (FileExistsError) if the directory already exists.
    """
    # prepare storage directory
    os.makedirs(storage_directory)
    os.mkdir(os.path.join(storage_directory, "objects"))
    if parity:
        os.mkdir(os.path.join(storage_directory, "parity"))

    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()

        # settings (all values are stored as text)
        cursor.execute("CREATE TABLE settings (option TEXT, value TEXT);")
        cursor.executemany(
            "INSERT INTO settings VALUES (?, ?);",
            [
                ("parity", str(parity)),
                ("parity_bytes", str(parity_bytes)),
                ("checksum_algorithm", checksum_algorithm),
                ("compress", str(compress)),
            ],
        )

        # container
        # INTEGER PRIMARY KEY makes id an alias for SQLite's rowid, so it is
        # assigned automatically on insert (not strictly AUTOINCREMENT, but
        # good enough here).
        cursor.execute("CREATE TABLE hashes (id INTEGER PRIMARY KEY, hash TEXT UNIQUE);")
        cursor.execute("CREATE TABLE tags (id INTEGER, tag TEXT);")

        db.commit()
    finally:
        # close the connection even if one of the statements failed
        db.close()


def add_tag(storage_directory, file_hash, tag):
    """Attach *tag* to *file_hash*, registering the hash if it is unknown.

    Returns True if the tag was newly added and False if the hash already
    carried that tag. The hash and tag inserts are committed together.
    """
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()

        cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
        row = cursor.fetchone()
        if row is None:
            cursor.execute("INSERT INTO hashes (hash) VALUES (?)", (file_hash,))
            # id is the rowid alias, so lastrowid is the id of the new hash
            internal_id = cursor.lastrowid
        else:
            internal_id = row[0]

        cursor.execute("SELECT 1 FROM tags WHERE id=? AND tag=? LIMIT 1", (internal_id, tag))
        tag_already_present = cursor.fetchone() is not None
        if not tag_already_present:
            cursor.execute("INSERT INTO tags (id, tag) VALUES (?, ?)", (internal_id, tag))

        db.commit()
    finally:
        db.close()

    return not tag_already_present


def get_tags_by_hash(storage_directory, file_hash):
    """Return the list of tags stored for *file_hash*.

    Returns None if the hash is not known to the container, and an empty
    list if it is known but has no tags.
    """
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()

        cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
        row = cursor.fetchone()
        if row is None:
            return None

        cursor.execute("SELECT tag FROM tags WHERE id=?", (row[0],))
        return [tag_row[0] for tag_row in cursor.fetchall()]
    finally:
        db.close()


def get_hashes_by_tag(storage_directory, tag):
    """Return the list of hashes that carry *tag* (empty list if none do)."""
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()

        cursor.execute("SELECT id FROM tags WHERE tag=?", (tag,))
        internal_ids = [row[0] for row in cursor.fetchall()]

        file_hashes = []
        for internal_id in internal_ids:
            cursor.execute("SELECT hash FROM hashes WHERE id=?", (internal_id,))
            row = cursor.fetchone()
            # skip tag rows whose id has no matching hash instead of crashing
            if row is not None:
                file_hashes.append(row[0])
        return file_hashes
    finally:
        db.close()


def file_hash_or_path_is_known_hash(storage_directory, file_hash_or_path, compress):
    """Tell whether *file_hash_or_path* names a stored object rather than a file.

    True only if "<storage_directory>/objects/<value>" (with ".xz" appended
    when *compress* is true) is a file AND the value itself is not the path
    of an existing file.
    """
    suffix = ".xz" if compress else ""
    object_path = os.path.join(storage_directory, "objects", file_hash_or_path+suffix)
    # The second check is required because os.path.join discards everything
    # before an absolute component: if file_hash_or_path is an absolute path,
    # object_path is just that path (plus suffix) and the first check alone
    # would wrongly succeed.
    return os.path.isfile(object_path) and not os.path.isfile(file_hash_or_path)


def load_container_settings(storage_directory):
    """Read the container settings from "container.sqlite".

    Returns a 5-tuple (status, parity, parity_bytes, checksum_algorithm,
    compress). status is PATH_ERROR (all other fields None) when the
    database file does not exist. Options missing from the settings table
    keep the defaults assigned below.
    """
    if not os.path.isfile(os.path.join(storage_directory, "container.sqlite")):
        return (PATH_ERROR, None, None, None, None)

    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    cursor = db.cursor()

    cursor.execute("SELECT option, value FROM settings")

    #TODO: check container settings properly instead of just assuming default values for things that aren't found
    # set default values and then read the db, just in case...
    parity = False
    parity_bytes = 512
    checksum_algorithm = "sha512"
    compress = False
    for row in cursor.fetchall():
        if row[0]=="parity":
            parity = row[1]==str(True)
        if row[0]=="parity_bytes":
            parity_bytes = int(row[1])
        if row[0]=="checksum_algorithm":
            checksum_algorithm = row[1]
        if row[0]=="compress":
            compress = row[1]==str(True)
    db.close()

    # check storage container against settings
    #TODO: check compression ?
if parity and not os.path.isdir(os.path.join(storage_directory, "parity")): return (GENERAL_ERROR, None, None, checksum_algorithm, compress) return (0, parity, parity_bytes, checksum_algorithm, compress) ################################################################################ # main program ################################################################################ if __name__ == "__main__": USAGE="""Usage: """+sys.argv[0]+""" create [parity=] [parity-bytes=] [checksum-algorithm=] [compress=] - set up a new storage directory """+sys.argv[0]+""" add - add tags to a file in the storage, if file is not already in the storage, same as add+file """+sys.argv[0]+""" add+file - copy a file to the storage and add tags """+sys.argv[0]+""" add+file+move - move a file to the storage and add tags """+sys.argv[0]+""" remove - remove a file from the storage, return error if not found or multiple found """+sys.argv[0]+""" remove+multi - remove a ll found files from the storage, return error if not found """+sys.argv[0]+""" search - return paths and tags of all found files """+sys.argv[0]+""" search+first - return hash and tags of first found file """+sys.argv[0]+""" search+unique - return hash and tags of the found file, return error if not found or multiple found """+sys.argv[0]+""" lookup - return hash and tags of all found files """+sys.argv[0]+""" lookup+first - return hash and tags of first found file """+sys.argv[0]+""" lookup+unique - return hash and tags of the found file, return error if not found or multiple found """+sys.argv[0]+""" link - add a symlink in that points to the referenced file """+sys.argv[0]+""" check - check file contents against hash """+sys.argv[0]+""" check+parity - check file contents against hash and parity file """+sys.argv[0]+""" check+all - check all files against their hashes """+sys.argv[0]+""" check+all+parity - check all files against their hashes and parity files """+sys.argv[0]+""" update - update the hash (and parity if 
applickable) of the specified file (specify by previous hash) """+sys.argv[0]+""" update+all - update hashes (and parities if applickable) of all mismatching files """+sys.argv[0]+""" fix - attempt to fix the file using parity """+sys.argv[0]+""" fix+all - attempt to fix all files using parity """+sys.argv[0]+""" help - display this message """ #TODO: +path modifier for things that return a hash to return the path to the stored file instead #TODO: +hash and +tags modifier for lookup #TODO: condense modifiers onto the same lines as the main subcommand where possible #TODO: clarification of <> and [] #TODO: subcommand to change container settings VALID_COMMANDS=["create", "add", "remove", "search", "lookup", "link", "check", "update", "fix", "help"] #TODO: (*fully) implemented subcommands: *create, *add, *lookup, *link, *help #TODO: unimplemented subcommands: remove, search, check, update, fix try: command = sys.argv[1].split("+") except IndexError: print("No subcommand specified.", file=sys.stderr) print(USAGE, file=sys.stderr) sys.exit(USAGE_ERROR) if not command[0] in VALID_COMMANDS: print("Invalid command: "+command[0], file=sys.stderr) print(USAGE, file=sys.stderr) sys.exit(USAGE_ERROR) # create subcommand: create a new directory containing a folder for stored objects, one for parity files and one for # arguments: [parity=] [parity-bytes=] [checksum-algorithm=] [compress=] if command[0] == "create": if len(sys.argv)<3: print("Too few arguments!", file=sys.stderr) print(USAGE, file=sys.stderr) sys.exit(USAGE_ERROR) storage_directory=sys.argv[2] if os.path.exists(storage_directory): print("Target path already exists. 
Please choose a different location.", file=sys.stderr) sys.exit(GENERAL_ERROR) # default options parity = False parity_bytes = 512 checksum_algorithm = "sha512" compress = False # check for command line options if len(sys.argv)>3: arguments = sys.argv[3:] for argument in arguments: if not len(argument.split("="))==2: print("Arguments to \"create\" always follow the scheme