#!/usr/bin/python3
import hashlib
import json
import os
import shutil
import sqlite3
import subprocess
import sys
################################################################################
# program wide constants
################################################################################
# exit status codes used by every subcommand
GENERAL_ERROR = 1
USAGE_ERROR = 2
PATH_ERROR = 3
def calculate_file_hash(algorithm, file_path, segment_size=4096):
    """Return the hex digest of the file at *file_path*.

    algorithm -- any algorithm name accepted by hashlib.new()
    segment_size -- chunk size in bytes, so large files are hashed
                    without being loaded into memory at once
    """
    hash_object = hashlib.new(algorithm)
    # "with" guarantees the descriptor is closed even if a read fails
    # (the original leaked the file handle)
    with open(file_path, "rb") as file_descriptor:
        # iter() with a b"" sentinel stops cleanly at end of file
        for segment in iter(lambda: file_descriptor.read(segment_size), b""):
            hash_object.update(segment)
    return hash_object.hexdigest()
def calculate_parity(byte_string):
    """XOR-fold *byte_string* into a single parity byte (int in 0..255)."""
    parity_byte = 0
    # iterating a bytes object yields the byte values directly,
    # no index arithmetic needed
    for current_byte in byte_string:
        parity_byte ^= current_byte
    return parity_byte
def gegerate_parity_file(input_path, parity_bytes, output_path):
    """Write a parity file for *input_path* to *output_path*.

    The input is consumed in segments of *parity_bytes* bytes and each
    segment is XOR-folded (calculate_parity) into one output byte.
    NOTE: the "gegerate" typo in the name is kept because callers use it.
    """
    # "with" closes both files even if a read or write raises
    # (the original leaked both handles on error)
    with open(input_path, "rb") as input_file, open(output_path, "wb") as output_file:
        for segment in iter(lambda: input_file.read(parity_bytes), b""):
            output_file.write(calculate_parity(segment).to_bytes(1, byteorder="big"))
def create_container(storage_directory, parity=False, parity_bytes=512, checksum_algorithm='sha512', compress=False):
    """Set up a new storage container at *storage_directory*.

    Creates the directory tree (objects/, optionally parity/) and a
    container.sqlite database holding the settings table plus the
    empty hashes and tags tables.  Raises OSError if the target
    already exists (os.makedirs without exist_ok).
    """
    # prepare storage directory
    os.makedirs(storage_directory)
    os.mkdir(os.path.join(storage_directory, "objects"))
    if parity:
        os.mkdir(os.path.join(storage_directory, "parity"))
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()
        # settings (stored as text; booleans serialized via str())
        cursor.execute("CREATE TABLE settings (option TEXT, value TEXT);")
        cursor.executemany("INSERT INTO settings VALUES (?, ?);", (
            ("parity", str(parity)),
            ("parity_bytes", str(parity_bytes)),
            ("checksum_algorithm", checksum_algorithm),
            ("compress", str(compress)),
        ))
        # container
        # INTEGER PRIMARY KEY behaves like an auto-increment rowid alias,
        # which is good enough here
        cursor.execute("CREATE TABLE hashes (id INTEGER PRIMARY KEY, hash TEXT UNIQUE);")
        cursor.execute("CREATE TABLE tags (id INTEGER, tag TEXT);")
        db.commit()
    finally:
        # close the connection even if table creation fails
        # (the original leaked it on any exception)
        db.close()
def add_tag(storage_directory, file_hash, tag):
    """Associate *tag* with *file_hash*, registering the hash if unknown.

    Returns True if the tag was newly added, False if that hash already
    carried the tag.
    """
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()
        cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
        row = cursor.fetchone()
        if row is None:
            cursor.execute("INSERT INTO hashes (hash) VALUES (?)", (file_hash,))
            # lastrowid gives the INTEGER PRIMARY KEY of the row just
            # inserted, so no second SELECT is needed
            internal_id = cursor.lastrowid
            db.commit()
        else:
            internal_id = row[0]
        # ask the database for the exact (id, tag) pair instead of
        # fetching all tags and scanning them in Python
        cursor.execute("SELECT 1 FROM tags WHERE id=? AND tag=?", (internal_id, tag))
        tag_already_present = cursor.fetchone() is not None
        if not tag_already_present:
            cursor.execute("INSERT INTO tags (id, tag) VALUES (?, ?)", (internal_id, tag))
            db.commit()
        return not tag_already_present
    finally:
        # close the connection even on error (the original leaked it)
        db.close()
def get_tags_by_hash(storage_directory, file_hash):
    """Return the list of tags stored for *file_hash*.

    Returns None when the hash is not registered in the container;
    a registered hash with no tags yields an empty list.
    """
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    cursor = db.cursor()
    tags = None
    cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
    row = cursor.fetchone()
    if row is not None:
        cursor.execute("SELECT tag FROM tags WHERE id=?", (row[0],))
        tags = [tag_row[0] for tag_row in cursor.fetchall()]
    db.close()
    return tags
def get_hashes_by_tag(storage_directory, tag):
    """Return the hashes of every file that carries exactly *tag*."""
    db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
    try:
        cursor = db.cursor()
        cursor.execute("SELECT id FROM tags WHERE tag=?", (tag,))
        internal_ids = [row[0] for row in cursor.fetchall()]
        file_hashes = []
        for internal_id in internal_ids:
            cursor.execute("SELECT hash FROM hashes WHERE id=?", (internal_id,))
            row = cursor.fetchone()
            file_hashes.append(row[0])
        return file_hashes
    finally:
        # the original never closed this connection at all
        db.close()
def file_hash_or_path_is_known_hash(storage_directory, file_hash_or_path, compress):
    """True if the argument is the hash of an object already stored in
    the container rather than a path to an existing local file."""
    suffix = ".xz" if compress else ""
    # os.path.join discards everything before the last element that is an
    # absolute path, so if file_hash_or_path were an absolute path join()
    # would return just that path; the extra isfile() check on the raw
    # argument guards against misclassifying it as a stored hash.
    object_path = os.path.join(storage_directory, "objects", file_hash_or_path + suffix)
    return os.path.isfile(object_path) and not os.path.isfile(file_hash_or_path)
def load_container_settings(storage_directory):
    """Read the settings table of the container at *storage_directory*.

    Returns a 5-tuple (status, parity, parity_bytes, checksum_algorithm,
    compress).  status is 0 on success, PATH_ERROR when no container
    database exists, and GENERAL_ERROR when the directory layout
    contradicts the stored settings.
    """
    database_path = os.path.join(storage_directory, "container.sqlite")
    if not os.path.isfile(database_path):
        return (PATH_ERROR, None, None, None, None)
    db = sqlite3.connect(database_path)
    cursor = db.cursor()
    cursor.execute("SELECT option, value FROM settings")
    #TODO: check container settings properly instead of just assuming default values for things that aren't found
    # start from defaults, then let whatever the database holds override them
    settings = {
        "parity": str(False),
        "parity_bytes": "512",
        "checksum_algorithm": "sha512",
        "compress": str(False),
    }
    settings.update(dict(cursor.fetchall()))
    db.close()
    parity = settings["parity"] == str(True)
    parity_bytes = int(settings["parity_bytes"])
    checksum_algorithm = settings["checksum_algorithm"]
    compress = settings["compress"] == str(True)
    # check storage container against settings
    #TODO: check compression ?
    if parity and not os.path.isdir(os.path.join(storage_directory, "parity")):
        return (GENERAL_ERROR, None, None, checksum_algorithm, compress)
    return (0, parity, parity_bytes, checksum_algorithm, compress)
################################################################################
# main program
################################################################################
if __name__ == "__main__":
    USAGE = """Usage:
""" + sys.argv[0] + """ create <storage directory> [parity=<on|off>] [parity-bytes=<number of bytes for each parity byte>] [checksum-algorithm=<algorithm>] [compress=<on|off>] - set up a new storage directory
""" + sys.argv[0] + """ add <storage directory> <hash|file> <tags ...> - add tags to a file in the storage, if file is not already in the storage, same as add+file
""" + sys.argv[0] + """ add+file <storage directory> <file> <tags ...> - copy a file to the storage and add tags
""" + sys.argv[0] + """ add+file+move <storage directory> <file> <tags ...> - move a file to the storage and add tags
""" + sys.argv[0] + """ remove <storage directory> <hash|unique tag or tag set> - remove a file from the storage, return error if not found or multiple found
""" + sys.argv[0] + """ remove+multi <storage directory> <exact tag or set of exact tags> - remove all found files from the storage, return error if not found
""" + sys.argv[0] + """ search <storage directory> <tags or partial tags> - return paths and tags of all found files
""" + sys.argv[0] + """ search+first <storage directory> <tags or partial tags> - return hash and tags of first found file
""" + sys.argv[0] + """ search+unique <storage directory> <tags or partial tags> - return hash and tags of the found file, return error if not found or multiple found
""" + sys.argv[0] + """ lookup <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of all found files
""" + sys.argv[0] + """ lookup+first <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of first found file
""" + sys.argv[0] + """ lookup+unique <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of the found file, return error if not found or multiple found
""" + sys.argv[0] + """ link <storage directory> <hash> <location> - add a symlink in <location> that points to the referenced file
""" + sys.argv[0] + """ check <storage directory> <hash> - check file contents against hash
""" + sys.argv[0] + """ check+parity <storage directory> <hash> - check file contents against hash and parity file
""" + sys.argv[0] + """ check+all <storage directory> - check all files against their hashes
""" + sys.argv[0] + """ check+all+parity <storage directory> - check all files against their hashes and parity files
""" + sys.argv[0] + """ update <storage directory> <hash> - update the hash (and parity if applicable) of the specified file (specify by previous hash)
""" + sys.argv[0] + """ update+all <storage directory> - update hashes (and parities if applicable) of all mismatching files
""" + sys.argv[0] + """ fix <storage directory> <hash> - attempt to fix the file using parity
""" + sys.argv[0] + """ fix+all <storage directory> - attempt to fix all files using parity
""" + sys.argv[0] + """ help - display this message
"""
    #TODO: +path modifier for things that return a hash to return the path to the stored file instead
    #TODO: +hash and +tags modifier for lookup
    #TODO: condense modifiers onto the same lines as the main subcommand where possible
    #TODO: clarification of <> and []
    #TODO: subcommand to change container settings
    VALID_COMMANDS = ["create", "add", "remove", "search", "lookup", "link", "check", "update", "fix", "help"]
    #TODO: (*fully) implemented subcommands: *create, *add, *lookup, *link, *help
    #TODO: unimplemented subcommands: remove, search, check, update, fix
    # the subcommand and its modifiers are separated by "+", e.g. "add+file+move"
    try:
        command = sys.argv[1].split("+")
    except IndexError:
        print("No subcommand specified.", file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(USAGE_ERROR)
    if not command[0] in VALID_COMMANDS:
        print("Invalid command: " + command[0], file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(USAGE_ERROR)
    # create subcommand: create a new directory containing a folder for stored objects, one for parity files and one for
    # arguments: <storage directory> [parity=<on|off>] [parity-bytes=<number of bytes for each parity byte>] [checksum-algorithm=<algorithm>] [compress=<on|off>]
    if command[0] == "create":
        if len(sys.argv) < 3:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        if os.path.exists(storage_directory):
            print("Target path already exists. Please choose a different location.", file=sys.stderr)
            sys.exit(GENERAL_ERROR)
        # default options
        parity = False
        parity_bytes = 512
        checksum_algorithm = "sha512"
        compress = False
        # check for command line options
        for argument in sys.argv[3:]:
            if not len(argument.split("=")) == 2:
                print("Arguments to \"create\" always follow the scheme <option>=<value>.", file=sys.stderr)
                print(USAGE, file=sys.stderr)
                sys.exit(USAGE_ERROR)
            option = argument.split("=")[0]
            value = argument.split("=")[1]
            if not option in ["parity", "parity-bytes", "checksum-algorithm", "compress"]:
                print("Unknown option: " + option, file=sys.stderr)
                print(USAGE, file=sys.stderr)
                sys.exit(USAGE_ERROR)
            if option == "parity":
                if not value in ["on", "off"]:
                    print("Option \"parity\" accepts either \"on\" or \"off\".", file=sys.stderr)
                    sys.exit(USAGE_ERROR)
                if value == "on":
                    parity = True
            if option == "parity-bytes":
                try:
                    parity_bytes = int(value)
                except ValueError:
                    print("Option \"parity-bytes\" only accepts integers.", file=sys.stderr)
                    sys.exit(USAGE_ERROR)
            if option == "checksum-algorithm":
                if not value in hashlib.algorithms_available:
                    # typo ("Chacksum") fixed; error now goes to stderr like
                    # every other diagnostic in this script
                    print("Checksum algorithm \"" + value + "\" not available.", file=sys.stderr)
                    sys.exit(USAGE_ERROR)
                checksum_algorithm = value
            if option == "compress":
                if not value in ["on", "off"]:
                    print("Option \"compress\" accepts either \"on\" or \"off\".", file=sys.stderr)
                    sys.exit(USAGE_ERROR)
                if value == "on":
                    compress = True
        create_container(storage_directory, parity=parity, parity_bytes=parity_bytes, checksum_algorithm=checksum_algorithm, compress=compress)
        sys.exit(0)
    # add subcommand: add a file to the storage container or add tags to it
    # arguments:
    #     <storage directory> <hash|file> <tags ...>
    # modifiers:
    #     file - requires a file path; adds a new file (or if file already in storage adds tags to that file), checks for collisions by comparing file size
    #     move - requires file modifier; moves the file to the storage dir instead of copying it
    if command[0] == "add":
        if len(sys.argv) < 5:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
        if not status == 0:
            if status == PATH_ERROR:
                print("Invalid storage directory!", file=sys.stderr)
                print(USAGE, file=sys.stderr)
            if status == GENERAL_ERROR:
                print("Verifying container settings failed.", file=sys.stderr)
            sys.exit(status)
        file_hash_or_path = sys.argv[3]
        # a bare hash may only be passed when the "file" modifier is absent
        hash_allowed = True
        if 'file' in command:
            hash_allowed = False
        if not any([hash_allowed and file_hash_or_path_is_known_hash(storage_directory, file_hash_or_path, compress), os.path.isfile(file_hash_or_path)]):
            print("Unknown file!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(PATH_ERROR)
        tags = sys.argv[4:]
        if hash_allowed and file_hash_or_path_is_known_hash(storage_directory, file_hash_or_path, compress):
            # the argument is the hash of an object that is already stored
            file_hash = file_hash_or_path
            print("Hash for file in storage: " + file_hash)
        else:
            file_hash = calculate_file_hash(checksum_algorithm, file_hash_or_path)
            if file_hash_or_path_is_known_hash(storage_directory, file_hash, compress):
                print("File already in storage.")
                # this assumes that the storage directory has not been tampered with or corrupted, FIXME!
                if 'move' in command:
                    print("Removing external file.")
                    os.remove(file_hash_or_path)
            else:
                if 'move' in command:
                    print("Moving file to storage.")
                    shutil.move(file_hash_or_path, os.path.join(storage_directory, "objects", file_hash))
                else:
                    print("Copying file to storage.")
                    shutil.copyfile(file_hash_or_path, os.path.join(storage_directory, "objects", file_hash))
                if parity:
                    gegerate_parity_file(os.path.join(storage_directory, "objects", file_hash), parity_bytes, os.path.join(storage_directory, "parity", file_hash))
                if compress:
                    print("Compressing...")
                    # subprocess.run waits for xz to finish and passes the
                    # path as a single argument; os.popen neither waited for
                    # the compressor nor was it safe against shell
                    # metacharacters in the storage path
                    subprocess.run(["xz", "--best", "-T0", os.path.join(storage_directory, "objects", file_hash)], check=True)
                    if parity:
                        subprocess.run(["xz", "--best", "-T0", os.path.join(storage_directory, "parity", file_hash)], check=True)
        for tag in tags:
            print("Adding tag: " + tag)
            if add_tag(storage_directory, file_hash, tag):
                print("Added.")
            else:
                print("Tag already present.")
    # lookup subcommand: return hash and tags of found files
    # arguments: <storage directory> <hash|exact tag|set of exact tags>
    # modifiers:
    #     first - only return one file
    #     unique - return error if not found or multiple found
    #     hash - perform lookup by hash
    #     tags - perform lookup by tag or set of tags
    if command[0] == "lookup":
        if len(sys.argv) < 4:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
        if not status == 0:
            if status == PATH_ERROR:
                print("Invalid storage directory!", file=sys.stderr)
                print(USAGE, file=sys.stderr)
            if status == GENERAL_ERROR:
                print("Verifying container settings failed.", file=sys.stderr)
            sys.exit(status)
        file_tags_or_hash = sys.argv[3:]
        lookup_results = {}
        if not 'tags' in command:
            if file_hash_or_path_is_known_hash(storage_directory, file_tags_or_hash[0], compress):
                lookup_results[file_tags_or_hash[0]] = get_tags_by_hash(storage_directory, file_tags_or_hash[0])
        if not 'hash' in command:
            # collect, per requested tag, the list of hashes carrying it
            file_hash_lists = [get_hashes_by_tag(storage_directory, tag) for tag in file_tags_or_hash]
            # keep only the hashes present in every per-tag list; the
            # original removed elements from common_file_hashes while
            # iterating over it, which silently skips the element after
            # each removal
            common_file_hashes = [file_hash for file_hash in file_hash_lists[0]
                                  if all(file_hash in file_hash_list for file_hash_list in file_hash_lists)]
            for file_hash in common_file_hashes:
                lookup_results[file_hash] = get_tags_by_hash(storage_directory, file_hash)
        if 'unique' in command:
            if len(lookup_results) == 1:
                print(json.dumps(lookup_results))
            else:
                # the original claimed "more than one" even when nothing matched
                print("Expected exactly one matching file, found " + str(len(lookup_results)) + ".", file=sys.stderr)
                sys.exit(GENERAL_ERROR)
        elif 'first' in command:
            if not lookup_results:
                # the original crashed with IndexError on an empty result set
                print("No matching file found.", file=sys.stderr)
                sys.exit(GENERAL_ERROR)
            file_hash, tags = next(iter(lookup_results.items()))
            print(json.dumps({file_hash: tags}))
        else:
            print(json.dumps(lookup_results))
    # link subcommand: add a symlink in <location> that points to the referenced file
    # arguments:
    #     <storage directory> <hash> <location>
    if command[0] == "link":
        if len(sys.argv) < 5:
            print("Too few arguments!", file=sys.stderr)
            print(USAGE, file=sys.stderr)
            sys.exit(USAGE_ERROR)
        storage_directory = sys.argv[2]
        file_hash = sys.argv[3]
        link_location = sys.argv[4]
        status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
        if not status == 0:
            if status == PATH_ERROR:
                print("Invalid storage directory!", file=sys.stderr)
                print(USAGE, file=sys.stderr)
            if status == GENERAL_ERROR:
                print("Verifying container settings failed.", file=sys.stderr)
            sys.exit(status)
        # NOTE: an unknown hash silently does nothing here (original behavior)
        if file_hash_or_path_is_known_hash(storage_directory, file_hash, compress):
            parent_directory = os.sep.join(link_location.split(os.sep)[:-1])
            if os.path.isdir(parent_directory):
                if os.path.exists(link_location):
                    print(link_location + ": file already exists.", file=sys.stderr)
                    sys.exit(GENERAL_ERROR)
                else:
                    suffix = ""
                    if compress:
                        suffix = ".xz"
                    object_path = os.path.join(storage_directory, "objects", file_hash + suffix)
                    os.symlink(object_path, link_location)
                    print(link_location + " -> " + object_path)
            else:
                print("Parent directory " + parent_directory + " does not exist.", file=sys.stderr)
                sys.exit(GENERAL_ERROR)
    # help subcommand
    if command[0] == "help":
        print(USAGE)
        sys.exit(0)

# this line is here to work around a bug in Xed