hugo_changed_hash.py

Okay, my current, though not necessarily final version.

"""
Script: r:/hugo/scripts/hugo_changed_hash.py

Script to determine which compiled Hugo files have probably changed following
a compile, based on file hash values. I currently check my personal site output
directory. I was only going to check the netlify one, but there might be
situations where that would not be sufficient.

Going to use the MD5 algorithm as I am not really worried about security here.
Just looking for possibly changed files.

Hashes are saved in/written to a python module (the file needs to be manually
created with an empty dictionary for the first run; a minimal example is
sketched just after this docstring) that is then imported by this one.
There may be more than one such file if I start generating multiple blogs.
  - 'to2c_n': pathlib.Path('r:/hugo/tooOldCode/ndocs'),
  - 'to2c_k': pathlib.Path('r:/hugo/tooOldCode/to2c_k_ca')
The '_k_' indicates the data is for the files in the personal site directory.
An '_n_' will indicate it is for the 'netlify' files. They will be different
as I don't minify the ones on the personal site.

Arguments
----

-w [y|n] - if yes write updated hashes and list of changed files to the data
            module, defaults to 'n'
-n <file name> - specify file name to which to write the hash data dictionary
                 and the list of changed files. Mainly used for testing.

Module/global variables:
----

B_DIRS - dictionary of starting directory by blog
BUF_SIZE - default buffer size for hash_block()
D_HASH - default hashing protocol
DEF_BLOG - default blog if nothing specified on command line
wrt_file - write new hashes/changed files to the appropriate data file, defaults to False
chg_files - list of files that are new or whose hash differs from the one previously in the data file

Functions
----

hash_block(file, f_hash): hash 'file' in chunks of the default buffer size, using the 'f_hash' constructor
hash_file(file, f_hash): hash 'file' as one block using the 'f_hash' constructor
hash_lines(file, f_hash): like hash_block(), hash 'file' in pieces, but this time via readline()

"""

import hashlib
import pathlib
import argparse

# B_DIRS = {
#   'to2c': pathlib.Path('r:/hugo/tOC_bkup/20210124_1027/ndocs')
# }
B_DIRS = {
  'to2c_n': pathlib.Path('r:/hugo/tooOldCode/ndocs'),
  'to2c_k': pathlib.Path('r:/hugo/tooOldCode/to2c_k_ca')
}

# An arbitrary (but fixed) buffer size (change as needed)
# 65536 bytes = 64 kilobytes
BUF_SIZE = 65536

# the following will eventually be changeable via cmd line parameters
# D_HASH = hashlib.md5()
# Doing the above instantiated the hash object, giving it an initial value. That
# value was then updated every time one of the file hashing functions was called;
# it was never reset to the starting value for each execution of the function.
D_HASH = hashlib.md5
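# A quick sketch of why the constructor is passed around instead of one object:
#
#   h = hashlib.md5()
#   h.update(b'abc'); print(h.hexdigest())   # digest of b'abc'
#   h.update(b'abc'); print(h.hexdigest())   # digest of b'abcabc', state accumulated
#
# Calling f_hash() inside each hashing function gives every file a fresh object.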
DEF_BLOG = 'to2c_k'
# for now, eventually set based on cmd line parameter
b_lbl = DEF_BLOG
h_func = D_HASH


def hash_block(file, f_hash):
  hf = f_hash()
  with open(file, 'rb') as f:
    while True:
      data = f.read(BUF_SIZE)
      if not data:
        break
      # feed this chunk to the hash object; update() accumulates across calls
      hf.update(data)

  # hexdigest() finalizes: it returns the digest of all the data passed in via
  # update(), formatted as a hexadecimal string
  return hf.hexdigest()


# started out as a test, but it is used below for files that fit in one buffer
def hash_file(file, f_hash):
  hf = f_hash()
  with open(file, 'rb') as f: 
    data = f.read()
  hf.update(data)
  return hf.hexdigest()


# not currently called anywhere below; kept from earlier testing
def hash_lines(file, f_hash):
  hf = f_hash()
  with open(file, 'rb') as f:
    f_ln = f.readline()
    while f_ln:
      hf.update(f_ln)
      f_ln = f.readline()
  return hf.hexdigest()
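# For any given file all three functions return the same digest, since update()
# is incremental; the debug block in the main loop below was used to confirm
# that hash_file() and hash_block() agree.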


parser = argparse.ArgumentParser(description='Check Hugo compile for changed files')
parser.add_argument('--wrt_file', '-w', help='Write hashes to data file?', choices=['n', 'y'], default='n')
parser.add_argument('--file_nm', '-n', help='File name to which to write hash & changed file data')

f_hashes = None
if DEF_BLOG == 'to2c_n':
  from to2c_n_hashes import f_hashes
elif DEF_BLOG == 'to2c_k':
  from to2c_k_hashes import f_hashes
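# note: the data module must be importable, i.e. in the script's directory or
# on sys.path; see the minimal starting contents sketched below the docstring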

file_nm = f"{b_lbl}_hashes.py"
wrt_file = False
chg_files = []
has_chgd = False

args = parser.parse_args()
if args.wrt_file == 'y':
  wrt_file = True
if args.file_nm:
  file_nm = args.file_nm

b_path = B_DIRS[b_lbl]

cnt = 10  # debug: start below 10 (e.g. 0) to print hash checks for the first few files
for p in b_path.glob("**/*"):
  if p.is_file():
    fl_hash = '0'
    t_size = p.stat().st_size
    if t_size > BUF_SIZE:
      fl_hash = hash_block(p, h_func)
    else:
      fl_hash = hash_file(p, h_func)

    # normalize the digest to the form the data module stores: a hex int literal
    hex_hash = hex(int(fl_hash, base=16))
    fl_path = p.relative_to(b_path).as_posix()
    tst_hash = f_hashes.get(fl_path)
    if cnt < 10:
      print(f"{fl_path}: {fl_hash} => {hex_hash} ?= {tst_hash}")
      flh_2 = hash_file(p, h_func)
      flh_3 = hash_block(p, h_func)
      print(f"\thash_file(): {flh_2} ?= hash_block(): {flh_3}")
      cnt += 1
    # None means the file is new; stored values import as ints, hence the hex()
    if (tst_hash is None) or (hex_hash != hex(tst_hash)):
      chg_files.append(fl_path)
      has_chgd = True
    f_hashes[fl_path] = hex_hash

# convert chg_files to list of quoted names
quote_changed = [f'"{fnm}"' for fnm in chg_files]
print(f"changed files ({len(chg_files)}): {chg_files}")

if wrt_file:
  # opened in append mode: on import, the last f_hashes/chg_files in the file win
  with open(file_nm, 'a') as fout:
    fout.write("\nf_hashes = {")
    for k, h in f_hashes.items():
      fout.write(f"'{k}': {h},\n")
    fout.write("}\n")
    fout.write(f"chg_files = [{', '.join(quote_changed)}]\n")

hugo_changed_len.py

Well, just cuz.

"""
Script: r:/hugo/scripts/hugo_changed_len.py

Script to determine which compiled Hugo files have probably changed following
a compile, based on file size. I currently check my personal site output
directory. I was only going to check the netlify one, but there might be
situations where that would not be sufficient.

File sizes are saved in/written to a python module (the file needs to be manually
created with an empty dictionary for the first run) that is then imported by this one.
There may be more than one such file if I start generating multiple blogs.
  - 'to2c_n': pathlib.Path('r:/hugo/tooOldCode/ndocs'),
  - 'to2c_k': pathlib.Path('r:/hugo/tooOldCode/to2c_k_ca')
The '_k_' indicates the data is for the files in the personal site directory.
An '_n_' will indicate it is for the 'netlify' files. They will be different
as I don't minify the ones on the personal site.

Arguments
----

-w [y|n] - if yes write updated file sizes and list of changed files to the data
            module, defaults to 'n'
-n <file name> - specify file name to which to write the size data dictionary
                 and the list of changed files. Mainly used for testing.

Module/global variables:
----

B_DIRS - dictionary of starting directory by blog
DEF_BLOG - default blog if nothing specified on command line
wrt_file - write new sizes/changed files to the appropriate data file, defaults to False
chg_files - list of files that are new or whose size differs from the one previously in the data file

Functions
----

None

"""
import pathlib
import argparse

B_DIRS = {
  'to2c_n': pathlib.Path('r:/hugo/tooOldCode/ndocs'),
  'to2c_k': pathlib.Path('r:/hugo/tooOldCode/to2c_k_ca')
}
# the following will eventually be changeable via cmd ln parameters
DEF_BLOG = 'to2c_k'
wrt_file = False
# for now, eventually set based on cmd line parameter
b_lbl = DEF_BLOG

parser = argparse.ArgumentParser(description='Check Hugo compile for changed files')
parser.add_argument('--wrt_file', '-w', help='Write file sizes to data file?', choices=['n', 'y'], default='n')
parser.add_argument('--file_nm', '-n', help='File name to which to write file sizes & changed file data')

f_sizes = None
if DEF_BLOG == 'to2c_n':
  from to2c_n_sizes import f_sizes
elif DEF_BLOG == 'to2c_k':
  from to2c_k_sizes import f_sizes

file_nm = f"{b_lbl}_sizes.py"

args = parser.parse_args()
if args.wrt_file == 'y':
  wrt_file = True
if args.file_nm:
  file_nm = args.file_nm

b_path = B_DIRS[b_lbl]
chg_files = []
for p in b_path.glob("**/*"):
  if p.is_file():
    f_size = p.stat().st_size
    f_path = p.relative_to(b_path).as_posix()
    tst_size = f_sizes.get(f_path)
    # None means the file is new; testing against 0 would misflag empty files
    if (tst_size is None) or (f_size != tst_size):
      chg_files.append(f_path)
      f_sizes[f_path] = f_size

# convert chg_files to list of quoted names
quote_changed = [f'"{fnm}"' for fnm in chg_files]
print(f"changed files ({len(chg_files)}): {chg_files}")

if wrt_file:
  with open(file_nm, 'a') as fout:
    fout.write("\nf_sizes = {")
    for k, h in f_sizes.items():
      fout.write(f"'{k}': {h},\n")
    fout.write("}\n")
    fout.write(f"chg_files = [{', '.join(quote_changed)}]\n")