# filetags.py - tag-based file storage with checksums, optional parity files
# and optional xz compression.
#!/usr/bin/python3
import sys, hashlib, os, sqlite3, shutil, json
################################################################################
# program wide constants
################################################################################
# process exit codes used by sys.exit() throughout the command line interface
GENERAL_ERROR=1
USAGE_ERROR=2
PATH_ERROR=3
################################################################################
# functions
################################################################################
def calculate_file_hash(algorithm, file_path, segment_size=4096):
    """Return the hex digest of the file at file_path.

    algorithm is any name accepted by hashlib.new() (e.g. "sha512").
    The file is read in segments of segment_size bytes so arbitrarily
    large files never have to fit in memory.
    """
    hash_object = hashlib.new(algorithm)
    # "with" closes the descriptor even if reading fails
    # (the original version leaked the open file handle).
    with open(file_path, "rb") as file_descriptor:
        # iter() with a b"" sentinel yields segments until EOF
        for segment in iter(lambda: file_descriptor.read(segment_size), b""):
            hash_object.update(segment)
    return hash_object.hexdigest()
def calculate_parity(byte_string):
    """Return the XOR (longitudinal parity) of all bytes in byte_string.

    Returns 0 for empty input.
    """
    parity_byte = 0b00000000
    # iterate the bytes directly instead of indexing with range(len(...))
    for byte in byte_string:
        parity_byte ^= byte
    return parity_byte


def gegerate_parity_file(input_path, parity_bytes, output_path):
    """Write a parity file for input_path to output_path.

    One parity byte (XOR of the chunk) is emitted for every chunk of
    parity_bytes input bytes; the final chunk may be shorter.

    NOTE: the misspelled name ("gegerate") is kept intentionally because
    existing callers use it.
    """
    # "with" guarantees both files are closed even if an I/O error occurs
    # (the original only closed them on the success path).
    with open(input_path, "rb") as input_file, open(output_path, "wb") as output_file:
        for segment in iter(lambda: input_file.read(parity_bytes), b""):
            output_file.write(calculate_parity(segment).to_bytes(1, byteorder='big'))
def create_container(storage_directory, parity=False, parity_bytes=512, checksum_algorithm='sha512', compress=False):
    """Initialise a new storage container at storage_directory.

    Creates the directory tree (objects/, and parity/ when parity is on)
    plus a container.sqlite database holding the settings and the
    hash/tag tables.  Raises OSError if storage_directory already exists.
    """
    # prepare storage directory
    os.makedirs(storage_directory)
    os.mkdir(os.path.join(storage_directory, "objects"))
    if parity:
        os.mkdir(os.path.join(storage_directory, "parity"))
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()
        # settings are stored as strings; load_container_settings parses them back
        cursor.execute("CREATE TABLE settings (option TEXT, value TEXT);")
        cursor.executemany(
            "INSERT INTO settings VALUES (?, ?);",
            [
                ("parity", str(parity)),
                ("parity_bytes", str(parity_bytes)),
                ("checksum_algorithm", checksum_algorithm),
                ("compress", str(compress)),
            ],
        )
        # INTEGER PRIMARY KEY aliases the rowid, which behaves close enough
        # to an auto-increment id for this use case
        cursor.execute("CREATE TABLE hashes (id INTEGER PRIMARY KEY, hash TEXT UNIQUE);")
        cursor.execute("CREATE TABLE tags (id INTEGER, tag TEXT);")
        db.commit()
    finally:
        db.close()  # the original leaked the connection if an execute failed
def add_tag(storage_directory, file_hash, tag):
    """Associate tag with file_hash in the container database.

    The hash row is created on first use.  Returns True when the tag was
    newly added, False when the file already carried it.
    """
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()
        cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
        row = cursor.fetchone()
        if row is None:
            cursor.execute("INSERT INTO hashes (hash) VALUES (?)", (file_hash,))
            db.commit()
            # lastrowid of the INSERT replaces the original's second SELECT
            internal_id = cursor.lastrowid
        else:
            internal_id = row[0]
        # query for the one tag directly instead of fetching the full tag list
        cursor.execute("SELECT 1 FROM tags WHERE id=? AND tag=?", (internal_id, tag))
        tag_already_present = cursor.fetchone() is not None
        if not tag_already_present:
            cursor.execute("INSERT INTO tags (id, tag) VALUES (?, ?)", (internal_id, tag))
            db.commit()
        return not tag_already_present
    finally:
        db.close()  # close on every path, not only on success
def get_tags_by_hash(storage_directory, file_hash):
    """Return the list of tags stored for file_hash.

    Returns None when the hash is not present in the container; an empty
    list means the hash is known but carries no tags.
    """
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()
        cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
        row = cursor.fetchone()
        if row is None:
            return None
        cursor.execute("SELECT tag FROM tags WHERE id=?", (row[0],))
        # comprehension replaces the original's quadratic list concatenation
        return [tag_row[0] for tag_row in cursor.fetchall()]
    finally:
        db.close()  # close on every path, not only on success
def get_hashes_by_tag(storage_directory, tag):
    """Return the list of file hashes that carry exactly the given tag."""
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()
        cursor.execute("SELECT id FROM tags WHERE tag=?", (tag,))
        internal_ids = [row[0] for row in cursor.fetchall()]
        file_hashes = []
        # resolve each id to its hash, preserving the tags-table order
        for internal_id in internal_ids:
            cursor.execute("SELECT hash FROM hashes WHERE id=?", (internal_id,))
            file_hashes.append(cursor.fetchone()[0])
        return file_hashes
    finally:
        # BUG FIX: the original never closed this connection at all
        db.close()
def file_hash_or_path_is_known_hash(storage_directory, file_hash_or_path, compress):
    """Decide whether the argument names an object already in the container.

    True only when an object file with that name exists under objects/
    (carrying an .xz suffix when the container is compressed) AND the
    argument is not itself a path to an existing external file.
    """
    suffix = ".xz" if compress else ""
    stored_object = os.path.join(storage_directory, "objects", file_hash_or_path + suffix)
    # The second check is needed because os.path.join discards everything
    # before the last absolute component: an absolute file path passed as
    # file_hash_or_path would otherwise masquerade as a stored object.
    return os.path.isfile(stored_object) and not os.path.isfile(file_hash_or_path)
def load_container_settings(storage_directory):
    """Read the container settings out of container.sqlite.

    Returns (status, parity, parity_bytes, checksum_algorithm, compress).
    status is 0 on success, PATH_ERROR when the database file is missing
    (all other fields None) and GENERAL_ERROR when the on-disk layout
    contradicts the settings (only the last two fields filled in).
    """
    database_path = os.path.join(storage_directory, "container.sqlite")
    if not os.path.isfile(database_path):
        return (PATH_ERROR, None, None, None, None)
    #TODO: check container settings properly instead of just assuming default values for things that aren't found
    # defaults applied when an option row is absent from the database
    parity = False
    parity_bytes = 512
    checksum_algorithm = "sha512"
    compress = False
    db = sqlite3.connect(database_path)
    cursor = db.cursor()
    cursor.execute("SELECT option, value FROM settings")
    for option, value in cursor.fetchall():
        if option == "parity":
            parity = value == str(True)
        elif option == "parity_bytes":
            parity_bytes = int(value)
        elif option == "checksum_algorithm":
            checksum_algorithm = value
        elif option == "compress":
            compress = value == str(True)
    db.close()
    # sanity-check the directory layout against the settings
    #TODO: check compression ?
    if parity and not os.path.isdir(os.path.join(storage_directory, "parity")):
        return (GENERAL_ERROR, None, None, checksum_algorithm, compress)
    return (0, parity, parity_bytes, checksum_algorithm, compress)
################################################################################
# main program
################################################################################
if __name__ == "__main__":
    # subprocess is only needed by the command line tool, not by the library part
    import subprocess
    USAGE = """Usage:
""" + sys.argv[0] + """ create <storage directory> [parity=<on|off>] [parity-bytes=<number of bytes for each parity byte>] [checksum-algorithm=<algorithm>] [compress=<on|off>] - set up a new storage directory
""" + sys.argv[0] + """ add <storage directory> <hash|file> <tags ...> - add tags to a file in the storage, if file is not already in the storage, same as add+file
""" + sys.argv[0] + """ add+file <storage directory> <file> <tags ...> - copy a file to the storage and add tags
""" + sys.argv[0] + """ add+file+move <storage directory> <file> <tags ...> - move a file to the storage and add tags
""" + sys.argv[0] + """ remove <storage directory> <hash|unique tag or tag set> - remove a file from the storage, return error if not found or multiple found
""" + sys.argv[0] + """ remove+multi <storage directory> <exact tag or set of exact tags> - remove all found files from the storage, return error if not found
""" + sys.argv[0] + """ search <storage directory> <tags or partial tags> - return paths and tags of all found files
""" + sys.argv[0] + """ search+first <storage directory> <tags or partial tags> - return hash and tags of first found file
""" + sys.argv[0] + """ search+unique <storage directory> <tags or partial tags> - return hash and tags of the found file, return error if not found or multiple found
""" + sys.argv[0] + """ lookup <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of all found files
""" + sys.argv[0] + """ lookup+first <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of first found file
""" + sys.argv[0] + """ lookup+unique <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of the found file, return error if not found or multiple found
""" + sys.argv[0] + """ link <storage directory> <hash> <location> - add a symlink in <location> that points to the referenced file
""" + sys.argv[0] + """ check <storage directory> <hash> - check file contents against hash
""" + sys.argv[0] + """ check+parity <storage directory> <hash> - check file contents against hash and parity file
""" + sys.argv[0] + """ check+all <storage directory> - check all files against their hashes
""" + sys.argv[0] + """ check+all+parity <storage directory> - check all files against their hashes and parity files
""" + sys.argv[0] + """ update <storage directory> <hash> - update the hash (and parity if applicable) of the specified file (specify by previous hash)
""" + sys.argv[0] + """ update+all <storage directory> - update hashes (and parities if applicable) of all mismatching files
""" + sys.argv[0] + """ fix <storage directory> <hash> - attempt to fix the file using parity
""" + sys.argv[0] + """ fix+all <storage directory> - attempt to fix all files using parity
""" + sys.argv[0] + """ help - display this message
"""
    #TODO: +path modifier for things that return a hash to return the path to the stored file instead
    #TODO: +hash and +tags modifier for lookup
    #TODO: condense modifiers onto the same lines as the main subcommand where possible
    #TODO: clarification of <> and []
    #TODO: subcommand to change container settings
    VALID_COMMANDS = ["create", "add", "remove", "search", "lookup", "link", "check", "update", "fix", "help"]
    #TODO: (*fully) implemented subcommands: *create, *add, *lookup, *link, *help
    #TODO: unimplemented subcommands: remove, search, check, update, fix

    # the first argument selects the subcommand; "+" separates modifiers
    try:
        command = sys.argv[1].split("+")
    except IndexError:
        print("No subcommand specified.", file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(USAGE_ERROR)
    if not command[0] in VALID_COMMANDS:
        print("Invalid command: "+command[0], file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(USAGE_ERROR)

    # create subcommand: create a new directory containing a folder for stored objects, one for parity files and the settings database
    # arguments: <storage directory> [parity=<on|off>] [parity-bytes=<number of bytes for each parity byte>] [checksum-algorithm=<algorithm>] [compress=<on|off>]
    if command[0] == "create":
        if len(sys.argv) < 3:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        if os.path.exists(storage_directory):
            print("Target path already exists. Please choose a different location.", file=sys.stderr)
            sys.exit(GENERAL_ERROR)
        # default options
        parity = False
        parity_bytes = 512
        checksum_algorithm = "sha512"
        compress = False
        # check for command line options
        if len(sys.argv) > 3:
            for argument in sys.argv[3:]:
                if not len(argument.split("=")) == 2:
                    print("Arguments to \"create\" always follow the scheme <option>=<value>.", file=sys.stderr)
                    print(USAGE, file=sys.stderr)
                    sys.exit(USAGE_ERROR)
                option, value = argument.split("=")
                if not option in ["parity", "parity-bytes", "checksum-algorithm", "compress"]:
                    print("Unknown option: "+option, file=sys.stderr)
                    print(USAGE, file=sys.stderr)
                    sys.exit(USAGE_ERROR)
                if option == "parity":
                    if not value in ["on", "off"]:
                        print("Option \"parity\" accepts either \"on\" or \"off\".", file=sys.stderr)
                        sys.exit(USAGE_ERROR)
                    if value == "on":
                        parity = True
                if option == "parity-bytes":
                    try:
                        parity_bytes = int(value)
                    except ValueError:
                        print("Option \"parity-bytes\" only accepts integers.", file=sys.stderr)
                        sys.exit(USAGE_ERROR)
                if option == "checksum-algorithm":
                    if not value in hashlib.algorithms_available:
                        # typo fixed ("Chacksum") and routed to stderr like the other errors
                        print("Checksum algorithm \""+value+"\" not available.", file=sys.stderr)
                        sys.exit(USAGE_ERROR)
                    checksum_algorithm = value
                if option == "compress":
                    if not value in ["on", "off"]:
                        print("Option \"compress\" accepts either \"on\" or \"off\".", file=sys.stderr)
                        sys.exit(USAGE_ERROR)
                    if value == "on":
                        compress = True
        create_container(storage_directory, parity=parity, parity_bytes=parity_bytes, checksum_algorithm=checksum_algorithm, compress=compress)
        sys.exit(0)

    # add subcommand: add a file to the storage container or add tags to it
    # arguments:
    # <storage directory> <hash|file> <tags ...>
    # modifiers:
    # file - requires a file path; adds a new file (or if file already in storage adds tags to that file), checks for collisions by comparing file size
    # move - requires file modifier; moves the file to the storage dir instead of copying it
    if command[0] == "add":
        if len(sys.argv) < 5:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
        if not status == 0:
            if status == PATH_ERROR:
                print("Invalid storage directory!", file=sys.stderr)
                print(USAGE, file=sys.stderr)
            if status == GENERAL_ERROR:
                print("Verifying container settings failed.", file=sys.stderr)
            sys.exit(status)
        file_hash_or_path = sys.argv[3]
        # the "file" modifier forces the argument to be a path, never a bare hash
        hash_allowed = 'file' not in command
        if not any([hash_allowed and file_hash_or_path_is_known_hash(storage_directory, file_hash_or_path, compress), os.path.isfile(file_hash_or_path)]):
            print("Unknown file!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(PATH_ERROR)
        tags = sys.argv[4:]
        if hash_allowed and file_hash_or_path_is_known_hash(storage_directory, file_hash_or_path, compress):
            file_hash = file_hash_or_path
            print("Hash for file in storage: "+file_hash)
        else:
            file_hash = calculate_file_hash(checksum_algorithm, file_hash_or_path)
            if file_hash_or_path_is_known_hash(storage_directory, file_hash, compress):
                print("File already in storage.")
                #this assumes that the storage directory has not been tampered with or corrupted, FIXME!
                if 'move' in command:
                    print("Removing external file.")
                    os.remove(file_hash_or_path)
            else:
                if 'move' in command:
                    print("Moving file to storage.")
                    shutil.move(file_hash_or_path, os.path.join(storage_directory, "objects", file_hash))
                else:
                    print("Copying file to storage.")
                    shutil.copyfile(file_hash_or_path, os.path.join(storage_directory, "objects", file_hash))
                if parity:
                    gegerate_parity_file(os.path.join(storage_directory, "objects", file_hash), parity_bytes, os.path.join(storage_directory, "parity", file_hash))
                if compress:
                    print("Compressing...")
                    # BUG FIX: os.popen neither waited for xz to finish nor
                    # protected against shell-special characters in the path;
                    # subprocess.run with a list argument does both.
                    subprocess.run(["xz", "--best", "-T0", os.path.join(storage_directory, "objects", file_hash)], check=True)
                    if parity:
                        subprocess.run(["xz", "--best", "-T0", os.path.join(storage_directory, "parity", file_hash)], check=True)
        for tag in tags:
            print("Adding tag: "+tag)
            if add_tag(storage_directory, file_hash, tag):
                print("Added.")
            else:
                print("Tag already present.")

    # lookup subcommand: return hash and tags of found files
    # arguments: <storage directory> <hash|exact tag|set of exact tags>
    # modifiers:
    # first - only return one file
    # unique - return error if not found or multiple found
    # hash - perform lookup by hash
    # tags - perform lookup by tag or set of tags
    if command[0] == "lookup":
        if len(sys.argv) < 4:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
        if not status == 0:
            if status == PATH_ERROR:
                print("Invalid storage directory!", file=sys.stderr)
                print(USAGE, file=sys.stderr)
            if status == GENERAL_ERROR:
                print("Verifying container settings failed.", file=sys.stderr)
            sys.exit(status)
        file_tags_or_hash = sys.argv[3:]
        lookup_results = {}
        if not 'tags' in command:
            # unless restricted to tags, try the first argument as a hash
            if file_hash_or_path_is_known_hash(storage_directory, file_tags_or_hash[0], compress):
                lookup_results[file_tags_or_hash[0]] = get_tags_by_hash(storage_directory, file_tags_or_hash[0])
        if not 'hash' in command:
            # collect, per tag, the list of file hashes carrying that tag
            file_hash_lists = [get_hashes_by_tag(storage_directory, tag) for tag in file_tags_or_hash]
            # keep only the hashes present in every per-tag list
            # BUG FIX: the original removed elements from the very list it was
            # iterating over, which can skip entries and leave false matches
            common_file_hashes = [file_hash for file_hash in file_hash_lists[0]
                                  if all(file_hash in file_hash_list for file_hash_list in file_hash_lists)]
            for file_hash in common_file_hashes:
                lookup_results[file_hash] = get_tags_by_hash(storage_directory, file_hash)
        if 'unique' in command:
            if len(lookup_results) == 1:
                print(json.dumps(lookup_results))
            elif len(lookup_results) == 0:
                # BUG FIX: zero matches used to report "More than one..."
                print("No matching file found.", file=sys.stderr)
                sys.exit(GENERAL_ERROR)
            else:
                print("More than one matching file found.", file=sys.stderr)
                sys.exit(GENERAL_ERROR)
        elif 'first' in command:
            if not lookup_results:
                # BUG FIX: the original crashed with an IndexError when nothing matched
                print("No matching file found.", file=sys.stderr)
                sys.exit(GENERAL_ERROR)
            file_hash, tags = list(lookup_results.items())[0]
            print(json.dumps({file_hash: tags}))
        else:
            print(json.dumps(lookup_results))

    # link subcommand: add a symlink in <location> that points to the referenced file
    # arguments:
    # <storage directory> <hash> <location>
    if command[0] == "link":
        if len(sys.argv) < 5:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        file_hash = sys.argv[3]
        link_location = sys.argv[4]
        status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
        if not status == 0:
            if status == PATH_ERROR:
                print("Invalid storage directory!", file=sys.stderr)
                print(USAGE, file=sys.stderr)
            if status == GENERAL_ERROR:
                print("Verifying container settings failed.", file=sys.stderr)
            sys.exit(status)
        if file_hash_or_path_is_known_hash(storage_directory, file_hash, compress):
            parent_directory = os.sep.join(link_location.split(os.sep)[:-1])
            if os.path.isdir(parent_directory):
                if os.path.exists(link_location):
                    print(link_location+": file already exists.", file=sys.stderr)
                    sys.exit(GENERAL_ERROR)
                else:
                    suffix = ".xz" if compress else ""
                    object_path = os.path.join(storage_directory, "objects", file_hash+suffix)
                    os.symlink(object_path, link_location)
                    print(link_location+" -> "+object_path)
            else:
                print("Parent directory "+parent_directory+" does not exist.", file=sys.stderr)
                sys.exit(GENERAL_ERROR)
        else:
            # BUG FIX: the original fell through silently for an unknown hash
            print("Unknown hash: "+file_hash, file=sys.stderr)
            sys.exit(PATH_ERROR)

    # help subcommand
    if command[0] == "help":
        print(USAGE)
        sys.exit(0)
# this line is here to work around a bug in Xed