filetags/filetags.py

#!/usr/bin/python3

import sys, hashlib, os, sqlite3, shutil

################################################################################
# program wide constants
################################################################################

GENERAL_ERROR=1
USAGE_ERROR=2
PATH_ERROR=3

################################################################################
# functions
################################################################################

def calculate_file_hash(algorithm, file_path, segment_size=4096):
  """Return the hexadecimal digest of a file's contents.

  The file is read in pieces of segment_size bytes, so the whole file never
  has to fit into memory at once.

  algorithm    -- hash name understood by hashlib.new(), e.g. "sha512"
  file_path    -- path of the file to hash
  segment_size -- number of bytes requested per read

  Raises OSError if the file cannot be opened or read and ValueError if
  hashlib does not know the algorithm.
  """
  # the with-block guarantees the descriptor is released, even if hashing fails
  with open(file_path, "rb") as file_descriptor:
    hash_object = hashlib.new(algorithm)
    segment = file_descriptor.read(segment_size)
    # read() returns an empty bytes object once the end of the file is reached
    while segment:
      hash_object.update(segment)
      segment = file_descriptor.read(segment_size)
  return hash_object.hexdigest()

def calculate_parity(byte_string):
  """Fold a byte sequence into a single parity byte.

  Every element of byte_string is XORed into an accumulator that starts at
  zero; the accumulated value is returned as an int. An empty sequence
  therefore yields 0.
  """
  accumulator = 0
  for current_byte in byte_string:
    accumulator ^= current_byte
  return accumulator

def gegerate_parity_file(input_path, parity_bytes, output_path):
  """Write a parity file for input_path to output_path.

  The input is read in segments of parity_bytes bytes; for every segment one
  byte (the result of calculate_parity for that segment) is appended to the
  output file. The last segment may be shorter than parity_bytes.

  input_path   -- file to protect
  parity_bytes -- number of input bytes covered by each parity byte
  output_path  -- file that receives the parity bytes (overwritten)

  Raises OSError if either file cannot be opened, read or written.

  NOTE: the function name is misspelled ("gegerate"); it is kept because
  callers reference it under this name.
  """
  # both files are closed by the with-block, even if reading or writing fails midway
  with open(input_path, "rb") as input_file, open(output_path, "wb") as output_file:
    segment = input_file.read(parity_bytes)
    # read() returns an empty bytes object once the end of the file is reached
    while segment:
      output_file.write(calculate_parity(segment).to_bytes(1, byteorder='big'))
      segment = input_file.read(parity_bytes)

def create_container(storage_directory, parity=False, parity_bytes=512, checksum_algorithm='sha512', compress=False):
  """Set up a new, empty storage container.

  Creates storage_directory with an "objects" subdirectory (plus a "parity"
  subdirectory when parity is enabled) and a "container.sqlite" database
  holding the container settings and the empty hash and tag tables.

  storage_directory  -- directory to create; must not exist yet
  parity             -- whether parity files are kept for stored objects
  parity_bytes       -- number of object bytes covered by each parity byte
  checksum_algorithm -- hashlib algorithm name used to identify objects
  compress           -- whether stored objects are compressed

  Raises OSError if a directory cannot be created (for example because it
  already exists) and sqlite3.Error if the database cannot be set up.
  """
  # prepare storage directory
  os.makedirs(storage_directory)
  os.mkdir(os.path.join(storage_directory, "objects"))
  if parity:
    os.mkdir(os.path.join(storage_directory, "parity"))

  db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
  try:
    cursor = db.cursor()
    # settings: every value is stored as text and converted back when the container is loaded
    cursor.execute("CREATE TABLE settings (option TEXT, value TEXT);")
    cursor.executemany("INSERT INTO settings VALUES (?, ?);", [
      ('parity', str(parity)),
      ('parity_bytes', str(parity_bytes)),
      ('checksum_algorithm', checksum_algorithm),
      ('compress', str(compress)),
    ])
    # container
    # an INTEGER PRIMARY KEY column is an alias for SQLite's rowid, so an id is
    # assigned automatically whenever a row is inserted without one
    cursor.execute("CREATE TABLE hashes (id INTEGER PRIMARY KEY, hash TEXT UNIQUE);")
    cursor.execute("CREATE TABLE tags (id INTEGER, tag TEXT);")
    db.commit()
  finally:
    # release the database handle even if one of the statements above failed
    db.close()

def add_tag(storage_directory, file_hash, tag):
  """Attach a tag to the file identified by file_hash.

  The hash is registered in the "hashes" table first if the container does
  not know it yet. A tag that is already attached to the file is not added a
  second time.

  storage_directory -- directory containing "container.sqlite"
  file_hash         -- hash identifying the file
  tag               -- tag text to attach

  Returns True if the tag was added and False if it was already present.
  Raises sqlite3.Error if the database cannot be read or written.
  """
  db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
  try:
    cursor = db.cursor()

    cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
    row = cursor.fetchone()
    if row is None:
      cursor.execute("INSERT INTO hashes (hash) VALUES (?)", (file_hash,))
      # hashes.id is an INTEGER PRIMARY KEY, so the rowid of the insert is the new id
      internal_id = cursor.lastrowid
    else:
      internal_id = row[0]

    # ask the database for this one tag instead of loading every tag of the file
    cursor.execute("SELECT 1 FROM tags WHERE id=? AND tag=?", (internal_id, tag))
    tag_is_new = cursor.fetchone() is None
    if tag_is_new:
      cursor.execute("INSERT INTO tags (id, tag) VALUES (?, ?)", (internal_id, tag))

    # a single commit keeps the new hash row and its tag together
    db.commit()
    return tag_is_new
  finally:
    # release the database handle even if one of the statements above failed
    db.close()

def get_tags_by_hash(storage_directory, file_hash):
  """Return the tags attached to the file identified by file_hash.

  storage_directory -- directory containing "container.sqlite"
  file_hash         -- hash identifying the file

  Returns None if the hash is not registered in the container, otherwise a
  list of tag strings (empty if the file has no tags).
  Raises sqlite3.Error if the database cannot be read.
  """
  db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
  try:
    cursor = db.cursor()

    cursor.execute("SELECT id FROM hashes WHERE hash=?", (file_hash,))
    row = cursor.fetchone()
    if row is None:
      # unknown hash: callers distinguish this from a known file without tags
      return None

    cursor.execute("SELECT tag FROM tags WHERE id=?", (row[0],))
    return [tag_row[0] for tag_row in cursor.fetchall()]
  finally:
    # release the database handle even if one of the queries above failed
    db.close()

def get_hashes_by_tag(storage_directory, tag):
  """Return the hashes of all files that carry exactly the given tag.

  storage_directory -- directory containing "container.sqlite"
  tag               -- exact tag text to look for

  Returns a list of hash strings, empty if no file carries the tag. Tag rows
  whose id has no entry in the "hashes" table are skipped.
  Raises sqlite3.Error if the database cannot be read.
  """
  db = sqlite3.connect(os.path.join(storage_directory, "container.sqlite"))
  try:
    cursor = db.cursor()
    # one join replaces a separate hash lookup per tag row; ordering by the
    # rowid of the tag row keeps the results in the order the tags were stored
    cursor.execute(
      "SELECT hashes.hash FROM tags JOIN hashes ON hashes.id=tags.id WHERE tags.tag=? ORDER BY tags.rowid",
      (tag,))
    return [row[0] for row in cursor.fetchall()]
  finally:
    # release the database handle even if the query failed
    db.close()

def file_is_in_storage(storage_directory, file_hash_or_path, compress):
  """Tell whether file_hash_or_path names an object inside the storage.

  The object is looked up as "<storage_directory>/objects/<name>", with
  ".xz" appended to the name when compress is set. Returns True only if that
  object file exists and file_hash_or_path is not itself an existing file.
  """
  object_name = file_hash_or_path + (".xz" if compress else "")
  object_path = os.path.join(storage_directory, "objects", object_name)
  if not os.path.isfile(object_path):
    return False
  # os.path.join drops everything before an absolute component, so for an
  # absolute file_hash_or_path the joined path is just that path again; the
  # check below keeps such an ordinary file from being taken for an object
  return not os.path.isfile(file_hash_or_path)

def load_container_settings(storage_directory):
  """Read the settings of an existing storage container.

  storage_directory -- directory expected to contain "container.sqlite"

  Returns a tuple (status, parity, parity_bytes, checksum_algorithm, compress):
    (PATH_ERROR, None, None, None, None) if there is no container database,
    (GENERAL_ERROR, None, None, checksum_algorithm, compress) if parity is
    enabled but the "parity" directory is missing,
    (0, parity, parity_bytes, checksum_algorithm, compress) otherwise.
  Options missing from the database keep their default values.
  Raises sqlite3.Error if the settings table cannot be read and ValueError if
  the stored parity_bytes value is not an integer.
  """
  database_path = os.path.join(storage_directory, "container.sqlite")
  if not os.path.isfile(database_path):
    return (PATH_ERROR, None, None, None, None)

  # set default values and then read the db, just in case...
  parity             = False
  parity_bytes       = 512
  checksum_algorithm = "sha512"
  compress           = False

  db = sqlite3.connect(database_path)
  try:
    cursor = db.cursor()
    cursor.execute("SELECT option, value FROM settings")
    for option, value in cursor.fetchall():
      # booleans were stored via str(), so compare against str(True)
      if option=="parity":
        parity = value==str(True)
      elif option=="parity_bytes":
        parity_bytes = int(value)
      elif option=="checksum_algorithm":
        checksum_algorithm = value
      elif option=="compress":
        compress = value==str(True)
  finally:
    # release the database handle even if reading or converting a value failed
    db.close()

  # check storage container against settings
  #TODO: check compression ?
  if parity and not os.path.isdir(os.path.join(storage_directory, "parity")):
    return (GENERAL_ERROR, None, None, checksum_algorithm, compress)
  return (0, parity, parity_bytes, checksum_algorithm, compress)

################################################################################
# main program
################################################################################

if __name__ == "__main__":
  # Command line front end: sys.argv[1] is "<subcommand>[+<modifier>...]";
  # the subcommand selects one of the handlers below, the modifiers adjust it.

  # imported here because only the command line front end starts external programs
  import subprocess

  USAGE="""Usage:
  """+sys.argv[0]+""" create <storage directory> [parity=<on|off>] [parity-bytes=<number of bytes for each parity byte>] [checksum-algorithm=<algorithm>] [compress=<on|off>] - set up a new storage directory
  """+sys.argv[0]+""" add <storage directory> <hash|file> <tags ...> - add tags to a file in the storage, if file is not already in the storage, same as add+file
  """+sys.argv[0]+""" add+file <storage directory> <file> <tags ...> - copy a file to the storage and add tags
  """+sys.argv[0]+""" add+file+move <storage directory> <file> <tags ...> - move a file to the storage and add tags
  """+sys.argv[0]+""" remove <storage directory> <hash|unique tag or tag set> - remove a file from the storage, return error if not found or multiple found
  """+sys.argv[0]+""" remove+multi <storage directory> <exact tag or set of exact tags> - remove all found files from the storage, return error if not found
  """+sys.argv[0]+""" search <storage directory> <tags or partial tags> - return paths and tags of all found files
  """+sys.argv[0]+""" search+first <storage directory> <tags or partial tags> - return hash and tags of first found file
  """+sys.argv[0]+""" search+unique <storage directory> <tags or partial tags> - return hash and tags of the found file, return error if not found or multiple found
  """+sys.argv[0]+""" lookup <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of all found files
  """+sys.argv[0]+""" lookup+first <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of first found file
  """+sys.argv[0]+""" lookup+unique <storage directory> <hash|exact tag|set of exact tags> - return hash and tags of the found file, return error if not found or multiple found
  """+sys.argv[0]+""" link <storage directory> <hash> <location> - add a symlink in <location> that points to the referenced file
  """+sys.argv[0]+""" check <storage directory> <hash> - check file contents against hash
  """+sys.argv[0]+""" check+parity <storage directory> <hash> - check file contents against hash and parity file
  """+sys.argv[0]+""" check+all <storage directory> - check all files against their hashes
  """+sys.argv[0]+""" check+all+parity <storage directory> - check all files against their hashes and parity files
  """+sys.argv[0]+""" update <storage directory> <hash> - update the hash (and parity if applicable) of the specified file (specify by previous hash)
  """+sys.argv[0]+""" update+all <storage directory> - update hashes (and parities if applicable) of all mismatching files
  """+sys.argv[0]+""" fix <storage directory> <hash> - attempt to fix the file using parity
  """+sys.argv[0]+""" fix+all <storage directory> - attempt to fix all files using parity
  """+sys.argv[0]+""" help - display this message
  """
  #TODO: +path modifier for things that return a hash to return the path to the stored file instead
  #TODO: +hash and +tags modifier for lookup
  #TODO: condense modifiers onto the same lines as the main subcommand where possible
  #TODO: clarification of <> and []
  #TODO: subcommand to change container settings
  VALID_COMMANDS=["create", "add", "remove", "search", "lookup", "link", "check", "update", "fix", "help"]

  # command[0] is the subcommand, any further elements are its modifiers
  try:
    command = sys.argv[1].split("+")
  except IndexError:
    print("No subcommand specified.", file=sys.stderr)
    print(USAGE, file=sys.stderr)
    sys.exit(USAGE_ERROR)
  if not command[0] in VALID_COMMANDS:
    print("Invalid command: "+command[0], file=sys.stderr)
    print(USAGE, file=sys.stderr)
    sys.exit(USAGE_ERROR)

  # help subcommand
  if command[0] == "help":
    print(USAGE)
    sys.exit(0)

  # create subcommand: create a new storage directory containing a folder for stored objects, optionally one for parity files, and the container database
  # arguments: <storage directory> [parity=<on|off>] [parity-bytes=<number of bytes for each parity byte>] [checksum-algorithm=<algorithm>] [compress=<on|off>]
  if command[0] == "create":
    if len(sys.argv)<3:
      print("Too few arguments!", file=sys.stderr)
      print(USAGE, file=sys.stderr)
      sys.exit(USAGE_ERROR)

    storage_directory=sys.argv[2]
    if os.path.exists(storage_directory):
      print("Target path already exists. Please choose a different location.", file=sys.stderr)
      sys.exit(GENERAL_ERROR)

    # default options
    parity             = False
    parity_bytes       = 512
    checksum_algorithm = "sha512"
    compress           = False
    # check for command line options
    for argument in sys.argv[3:]:
      option_and_value = argument.split("=")
      if not len(option_and_value)==2:
        print("Arguments to \"create\" always follow the scheme <option>=<value>.", file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(USAGE_ERROR)
      option, value = option_and_value
      if not option in ["parity", "parity-bytes", "checksum-algorithm", "compress"]:
        print("Unknown option: "+option, file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(USAGE_ERROR)
      if option=="parity":
        if not value in ["on", "off"]:
          print("Option \"parity\" accepts either \"on\" or \"off\".", file=sys.stderr)
          sys.exit(USAGE_ERROR)
        if value=="on":
          parity = True
      if option=="parity-bytes":
        try:
          parity_bytes = int(value)
        except ValueError:
          print("Option \"parity-bytes\" only accepts integers.", file=sys.stderr)
          sys.exit(USAGE_ERROR)
        # zero would produce empty parity files and a negative value one parity byte per file
        if parity_bytes<1:
          print("Option \"parity-bytes\" only accepts positive integers.", file=sys.stderr)
          sys.exit(USAGE_ERROR)
      if option=="checksum-algorithm":
        if not value in hashlib.algorithms_available:
          print("Checksum algorithm \""+value+"\" not available.", file=sys.stderr)
          sys.exit(USAGE_ERROR)
        checksum_algorithm = value
      if option=="compress":
        if not value in ["on", "off"]:
          print("Option \"compress\" accepts either \"on\" or \"off\".", file=sys.stderr)
          sys.exit(USAGE_ERROR)
        if value=="on":
          compress = True

    create_container(storage_directory, parity=parity, parity_bytes=parity_bytes, checksum_algorithm=checksum_algorithm, compress=compress)
    sys.exit(0)

  # add subcommand: add a file to the storage container or add tags to it
  # arguments:
  #  <storage directory> <hash|file> <tags ...>
  # modifiers:
  #  file - requires a file path; adds a new file (or if file already in storage adds tags to that file), checks for collisions by comparing file size
  #  move - requires file modifier; moves the file to the storage dir instead of copying it
  if command[0] == "add":
    if len(sys.argv)<5:
      print("Too few arguments!", file=sys.stderr)
      print(USAGE, file=sys.stderr)
      sys.exit(USAGE_ERROR)

    storage_directory = sys.argv[2]
    status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
    if not status==0:
      if status==PATH_ERROR:
        print("Invalid storage directory!", file=sys.stderr)
        print(USAGE, file=sys.stderr)
      if status==GENERAL_ERROR:
        print("Verifying container settings failed.", file=sys.stderr)
      sys.exit(status)

    file_hash_or_path = sys.argv[3]
    # with the "file" modifier the argument is always a path, never a hash
    hash_allowed = not 'file' in command
    referenced_by_hash = hash_allowed and file_is_in_storage(storage_directory, file_hash_or_path, compress)
    if not referenced_by_hash and not os.path.isfile(file_hash_or_path):
      print("Unknown file!", file=sys.stderr)
      print(USAGE, file=sys.stderr)
      sys.exit(PATH_ERROR)

    tags = sys.argv[4:]

    if referenced_by_hash:
      file_hash = file_hash_or_path
      print("File already in storage.")
    else:
      file_hash = calculate_file_hash(checksum_algorithm, file_hash_or_path)
      object_path = os.path.join(storage_directory, "objects", file_hash)
      # the lookup has to use the calculated hash; an existing path never names a stored object
      if file_is_in_storage(storage_directory, file_hash, compress):
        print("File already in storage.")
        #this assumes that the storage directory has not been tampered with or corrupted, FIXME!
        if 'move' in command:
          # if the given path is the stored object itself, removing it would destroy the only copy
          is_stored_object = (not compress) and os.path.samefile(file_hash_or_path, object_path)
          if not is_stored_object:
            print("Removing external file.")
            os.remove(file_hash_or_path)
      else:
        if 'move' in command:
          print("Moving file to storage.")
          shutil.move(file_hash_or_path, object_path)
        else:
          print("Copying file to storage.")
          shutil.copyfile(file_hash_or_path, object_path)
        files_to_compress = [object_path]
        if parity:
          parity_path = os.path.join(storage_directory, "parity", file_hash)
          gegerate_parity_file(object_path, parity_bytes, parity_path)
          files_to_compress = files_to_compress+[parity_path]
        if compress:
          print("Compressing...")
          for path_to_compress in files_to_compress:
            # an argument list avoids the shell, so paths with spaces or special
            # characters are safe; run() also waits for xz to finish
            try:
              xz_result = subprocess.run(["xz", "--best", "-T0", "--", path_to_compress])
            except OSError as error:
              print("Could not run xz: "+str(error), file=sys.stderr)
              sys.exit(GENERAL_ERROR)
            if not xz_result.returncode==0:
              print("Compressing \""+path_to_compress+"\" failed.", file=sys.stderr)
              sys.exit(GENERAL_ERROR)

    for tag in tags:
      print("Adding tag: "+tag)
      if add_tag(storage_directory, file_hash, tag):
        print("Added.")
      else:
        print("Tag already present.")

  # lookup subcommand: return hash and tags of found files
  # arguments: <storage directory> <hash|exact tag|set of exact tags>
  # modifiers:
  #  first - only return one file
  #  unique - return error if not found or multiple found
  #TODO: modifiers
  if command[0] == "lookup":
    if len(sys.argv)<4:
      print("Too few arguments!", file=sys.stderr)
      print(USAGE, file=sys.stderr)
      sys.exit(USAGE_ERROR)

    storage_directory = sys.argv[2]
    status, parity, parity_bytes, checksum_algorithm, compress = load_container_settings(storage_directory)
    if not status==0:
      if status==PATH_ERROR:
        print("Invalid storage directory!", file=sys.stderr)
        print(USAGE, file=sys.stderr)
      if status==GENERAL_ERROR:
        print("Verifying container settings failed.", file=sys.stderr)
      sys.exit(status)

    file_tags_or_hash = sys.argv[3:]
    # the first argument may be the hash of a stored file; if so, list its tags
    if file_is_in_storage(storage_directory, file_tags_or_hash[0], compress):
      tags = get_tags_by_hash(storage_directory, file_tags_or_hash[0])
      print("Tags for file:")
      print(tags)

    # create a two dimensional array of all the files associated with each individual tag
    file_hash_lists = [get_hashes_by_tag(storage_directory, tag) for tag in file_tags_or_hash]
    # start with the files of the first tag and keep only those that also carry every other tag;
    # a new list is built each round because removing from a list while iterating it skips elements
    common_file_hashes = list(file_hash_lists[0])
    for file_hash_list in file_hash_lists[1:]:
      common_file_hashes = [file_hash for file_hash in common_file_hashes if file_hash in file_hash_list]

    if common_file_hashes:
      print("Files for tag(s):")
      print(common_file_hashes)


# this line is here to work around a bug in Xed