Inspiration for this notebook: https://medium.com/@zhangkd5/a-tutorial-for-difflib-a-powerful-python-standard-library-to-compare-textual-sequences-096d52b4c843

from difflib import HtmlDiff, SequenceMatcher

# Compare two text files: compute a character-level similarity ratio with
# SequenceMatcher and save a side-by-side HTML diff with the ratio on top.

# Define the paths to the text files
# NOTE(review): both paths point at the same file, so the ratio is always
# 1.00 and the diff shows no changes -- confirm whether file B should differ.
path_to_file_a = '0003_bv172tc9618_0003_print.txt'
path_to_file_b = '0003_bv172tc9618_0003_print.txt'

# Read the contents of the files
with open(path_to_file_a, 'r', encoding='utf-8') as file:
    a = file.read()

with open(path_to_file_b, 'r', encoding='utf-8') as file:
    b = file.read()

# Calculate the similarity ratio using SequenceMatcher (compares the two
# strings character by character; the result is a float between 0 and 1).
# NOTE(review): the default autojunk heuristic is active here; on long texts
# it may lower the ratio -- confirm whether autojunk=False is wanted.
seq_match = SequenceMatcher(None, a, b)
ratio = seq_match.ratio()

# Create the HtmlDiff object
d = HtmlDiff()

# Generate the HTML diff (a complete HTML document, compared line by line)
html_diff = d.make_file(a.splitlines(), b.splitlines())

# Add the similarity ratio to the HTML diff. make_file() already returns a
# full document starting with its DOCTYPE, so the heading is inserted right
# after the opening <body> tag instead of in front of the whole document.
ratio_heading = f"<h2>Similarity Ratio: {ratio:.2f}</h2>\n"
if '<body>' in html_diff:
    ratio_html = html_diff.replace('<body>', '<body>\n' + ratio_heading, 1)
else:
    # Unexpected template without a <body> tag: fall back to prepending.
    ratio_html = ratio_heading + html_diff

# Save the diff to an HTML file
with open('0003_bv172tc9618_0003_print_verify.html', 'w', encoding='utf-8') as f:
    f.write(ratio_html)

Script to compare the text files in two directories (folders), page by page, for each paper

import os
from difflib import HtmlDiff, SequenceMatcher

# Compare the text files of two directories pairwise and write one HTML diff
# report per pair (named after the file from folder A) into the current
# working directory.

# Define the paths to the directories
path_to_folder_a = './PrintModelText/PrintModelText/zz472cp8582_jpg/txt'
path_to_folder_b = './PrivateModel_V3/PrivateModel_V3/zz472cp8582_jpg_v3/txt'

# Create the HtmlDiff object
d = HtmlDiff()

# Get the sorted list of regular files in each directory. Sub-directories
# (for example a notebook checkpoint folder) are skipped because open() would
# fail on them.
files_a = sorted(
    name for name in os.listdir(path_to_folder_a)
    if os.path.isfile(os.path.join(path_to_folder_a, name))
)
files_b = sorted(
    name for name in os.listdir(path_to_folder_b)
    if os.path.isfile(os.path.join(path_to_folder_b, name))
)

# Ensure both directories have the same number of files
if len(files_a) != len(files_b):
    print("Error: The directories do not contain the same number of files.")
    # Stop here if both directories don't have the same number of files.
    # Raising SystemExit halts the cell immediately; exit() inside a notebook
    # kernel only requests a shutdown and lets the rest of the cell keep
    # running on the mismatched listings.
    raise SystemExit(1)

# Compare each pair of files. Files are paired by their position in the
# sorted listings, not by name.
for file_a, file_b in zip(files_a, files_b):
    # Read the contents of the files
    with open(os.path.join(path_to_folder_a, file_a), 'r', encoding='utf-8') as fa:
        a = fa.read()

    with open(os.path.join(path_to_folder_b, file_b), 'r', encoding='utf-8') as fb:
        b = fb.read()

    # Calculate the similarity ratio between a and b (character level).
    # NOTE(review): the default autojunk heuristic is active here; on long
    # texts it may lower the ratio -- confirm whether autojunk=False is wanted.
    seq_match = SequenceMatcher(None, a, b)
    ratio = seq_match.ratio()
    ratio_percentage = ratio * 100

    # Generate the HTML diff report. splitlines() splits the content into
    # individual lines so the report is a line-by-line comparison.
    html_diff = d.make_file(a.splitlines(), b.splitlines())

    # Add the similarity ratio (two decimal places) to the report. make_file()
    # returns a full document starting with its DOCTYPE, so the heading is
    # inserted right after the opening <body> tag instead of in front of it.
    ratio_heading = f"<h2>Similarity Ratio: {ratio_percentage:.2f}%</h2>\n"
    if '<body>' in html_diff:
        ratio_html = html_diff.replace('<body>', '<body>\n' + ratio_heading, 1)
    else:
        # Unexpected template without a <body> tag: fall back to prepending.
        ratio_html = ratio_heading + html_diff

    # Define the output file name
    output_file_name = os.path.splitext(file_a)[0] + '_diff.html'

    # Save the diff to an HTML file
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(ratio_html)

    print(f"Processed: {output_file_name}")

print("All files have been processed.")

Processed: 0001_zz472cp8582_0001_diff.html
Processed: 0002_zz472cp8582_0002_diff.html
Processed: 0003_zz472cp8582_0003_diff.html
Processed: 0004_zz472cp8582_0004_diff.html
Processed: 0005_zz472cp8582_0005_diff.html
Processed: 0006_zz472cp8582_0006_diff.html
Processed: 0007_zz472cp8582_0007_diff.html
Processed: 0008_zz472cp8582_0008_diff.html
Processed: 0009_zz472cp8582_0009_diff.html
Processed: 0010_zz472cp8582_0010_diff.html
Processed: 0011_zz472cp8582_0011_diff.html
Processed: 0012_zz472cp8582_0012_diff.html
Processed: 0013_zz472cp8582_0013_diff.html
Processed: 0014_zz472cp8582_0014_diff.html
Processed: 0015_zz472cp8582_0015_diff.html
Processed: 0016_zz472cp8582_0016_diff.html
Processed: 0017_zz472cp8582_0017_diff.html
All files have been processed.

Script to print out Similarity Ratio (%) for each page per paper

import os
from difflib import HtmlDiff, SequenceMatcher

# Compare the text files of two directories pairwise, write one HTML diff
# report per pair into the current working directory, and print a table of
# the similarity ratio (in percent) for every compared file.

# Define the paths to the directories
path_to_folder_a = './PrintModelText/PrintModelText/bv172tc9618_jpg/txt'
path_to_folder_b = './PrivateModel_V3/PrivateModel_V3/bv172tc9618_jpg_v3/txt'

# Create the HtmlDiff object
d = HtmlDiff()

# Get the sorted list of regular files in each directory. Sub-directories
# (for example a notebook checkpoint folder) are skipped because open() would
# fail on them.
files_a = sorted(
    name for name in os.listdir(path_to_folder_a)
    if os.path.isfile(os.path.join(path_to_folder_a, name))
)
files_b = sorted(
    name for name in os.listdir(path_to_folder_b)
    if os.path.isfile(os.path.join(path_to_folder_b, name))
)

# Ensure both directories have the same number of files
if len(files_a) != len(files_b):
    print("Error: The directories do not contain the same number of files.")
    # Raising SystemExit halts the cell immediately; exit() inside a notebook
    # kernel only requests a shutdown and lets the rest of the cell keep
    # running on the mismatched listings.
    raise SystemExit(1)

# Store results for the table as (file base name, ratio in percent) tuples
results = []

# Compare each pair of files. Files are paired by their position in the
# sorted listings, not by name.
for file_a, file_b in zip(files_a, files_b):
    # Read the contents of the files
    with open(os.path.join(path_to_folder_a, file_a), 'r', encoding='utf-8') as fa:
        a = fa.read()

    with open(os.path.join(path_to_folder_b, file_b), 'r', encoding='utf-8') as fb:
        b = fb.read()

    # Calculate the similarity ratio using SequenceMatcher (character level).
    # NOTE(review): the default autojunk heuristic is active here; on long
    # texts it may lower the ratio -- confirm whether autojunk=False is wanted.
    seq_match = SequenceMatcher(None, a, b)
    ratio = seq_match.ratio()
    ratio_percentage = ratio * 100

    # Generate the HTML diff (line-by-line comparison)
    html_diff = d.make_file(a.splitlines(), b.splitlines())

    # Add the similarity ratio to the HTML diff. make_file() returns a full
    # document starting with its DOCTYPE, so the heading is inserted right
    # after the opening <body> tag instead of in front of the whole document.
    ratio_heading = f"<h2>Similarity Ratio: {ratio_percentage:.2f}%</h2>\n"
    if '<body>' in html_diff:
        ratio_html = html_diff.replace('<body>', '<body>\n' + ratio_heading, 1)
    else:
        # Unexpected template without a <body> tag: fall back to prepending.
        ratio_html = ratio_heading + html_diff

    # Base name (file name without extension) is used both for the output
    # file and for the row label in the results table.
    file_base_name = os.path.splitext(file_a)[0]

    # Define the output file name
    output_file_name = file_base_name + '_comp.html'

    # Save the diff to an HTML file
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(ratio_html)

    # Print out the processed file
    print(f"Processed: {output_file_name}")

    # Append the result to the list
    results.append((file_base_name, ratio_percentage))

print("All files have been processed.")

# Print out the table: left-aligned name column (30 wide) followed by the
# ratio column (20 wide, two decimal places).
print("\nComparison Results:")
print(f"{'File Name and Page Number':<30} {'Similarity Ratio (%)':<20}")
print("=" * 50)
for result in results:
    print(f"{result[0]:<30} {result[1]:<20.2f}")

Processed: 0001_bv172tc9618_0001_comp.html
Processed: 0002_bv172tc9618_0002_comp.html
Processed: 0003_bv172tc9618_0003_comp.html
Processed: 0004_bv172tc9618_0004_comp.html
Processed: 0005_bv172tc9618_0005_comp.html
Processed: 0006_bv172tc9618_0006_comp.html
Processed: 0007_bv172tc9618_0007_comp.html
Processed: 0008_bv172tc9618_0008_comp.html
Processed: 0009_bv172tc9618_0009_comp.html
Processed: 0010_bv172tc9618_0010_comp.html
Processed: 0011_bv172tc9618_0011_comp.html
Processed: 0012_bv172tc9618_0012_comp.html
Processed: 0013_bv172tc9618_0013_comp.html
Processed: 0014_bv172tc9618_0014_comp.html
All files have been processed.

Comparison Results:
File Name and Page Number      Similarity Ratio (%)
==================================================
0001_bv172tc9618_0001          15.33               
0002_bv172tc9618_0002          8.80                
0003_bv172tc9618_0003          19.17               
0004_bv172tc9618_0004          37.14               
0005_bv172tc9618_0005          9.09                
0006_bv172tc9618_0006          39.30               
0007_bv172tc9618_0007          4.11                
0008_bv172tc9618_0008          27.47               
0009_bv172tc9618_0009          3.04                
0010_bv172tc9618_0010          11.35               
0011_bv172tc9618_0011          8.27                
0012_bv172tc9618_0012          50.11               
0013_bv172tc9618_0013          21.28               
0014_bv172tc9618_0014          44.60