from difflib import HtmlDiff, SequenceMatcher
# Compare two text files: compute a character-level similarity ratio and write
# a side-by-side HTML diff report with the ratio shown as a heading.

# Define the paths to the text files
# NOTE(review): both paths name the same file, so the ratio below will be 1.0
# and the report will highlight no differences -- confirm this self-comparison
# is intended (the output name suggests a verification run).
path_to_file_a = '0003_bv172tc9618_0003_print.txt'
path_to_file_b = '0003_bv172tc9618_0003_print.txt'

# Read the contents of the files
with open(path_to_file_a, 'r', encoding='utf-8') as file:
    a = file.read()

with open(path_to_file_b, 'r', encoding='utf-8') as file:
    b = file.read()

# Calculate the similarity ratio using SequenceMatcher.
# The matcher is given the whole strings, so the ratio is computed over
# characters, while the HTML diff below is computed over lines.
seq_match = SequenceMatcher(None, a, b)
ratio = seq_match.ratio()

# Create the HtmlDiff object
d = HtmlDiff()

# Generate the HTML diff (make_file returns a complete HTML document)
html_diff = d.make_file(a.splitlines(), b.splitlines())

# Add the similarity ratio to the HTML diff; the heading is placed in front of
# the generated document.
ratio_html = f"<h2>Similarity Ratio: {ratio:.2f}</h2>\n" + html_diff

# Save the diff to an HTML file
with open('0003_bv172tc9618_0003_print_verify.html', 'w', encoding='utf-8') as f:
    f.write(ratio_html)
Inspiration for this notebook: https://medium.com/@zhangkd5/a-tutorial-for-difflib-a-powerful-python-standard-library-to-compare-textual-sequences-096d52b4c843
Script to read in directories (folders) for each paper
import os
from difflib import HtmlDiff, SequenceMatcher
# Compare two directories of text files pairwise (one file per page) and write
# one HTML diff report per pair into the current working directory.

# Define the paths to the directories
path_to_folder_a = './PrintModelText/PrintModelText/zz472cp8582_jpg/txt'
path_to_folder_b = './PrivateModel_V3/PrivateModel_V3/zz472cp8582_jpg_v3/txt'

# Create the HtmlDiff object
d = HtmlDiff()

# Get the list of files in each directory.
# Sorting makes the pairing below deterministic: files are matched purely by
# their position in the sorted listings, not by name.
files_a = sorted(os.listdir(path_to_folder_a))
files_b = sorted(os.listdir(path_to_folder_b))

# Ensure both directories have the same number of files
if len(files_a) != len(files_b):
    print("Error: The directories do not contain the same number of files.")
    # Exit the program if both directories don't have the same number of
    # files. SystemExit is raised directly because exit() is only a helper
    # injected by the `site` module for interactive use; the non-zero status
    # also marks the run as failed.
    raise SystemExit(1)

# Compare each pair of files
for file_a, file_b in zip(files_a, files_b):
    # Read the contents of the files
    with open(os.path.join(path_to_folder_a, file_a), 'r', encoding='utf-8') as fa:
        a = fa.read()

    with open(os.path.join(path_to_folder_b, file_b), 'r', encoding='utf-8') as fb:
        b = fb.read()

    # Calculate the similarity ratio between a and b using SequenceMatcher
    # (character-level, since whole strings are passed in).
    seq_match = SequenceMatcher(None, a, b)
    ratio = seq_match.ratio()
    ratio_percentage = ratio * 100

    # Generate the HTML diff report on the differences. splitlines() is used
    # to split the content into individual lines.
    html_diff = d.make_file(a.splitlines(), b.splitlines())

    # Add the similarity ratio to the HTML diff. The ratio is formatted to two
    # decimal places.
    ratio_html = f"<h2>Similarity Ratio: {ratio_percentage:.2f}%</h2>\n" + html_diff

    # Define the output file name (named after the file from folder a)
    output_file_name = os.path.splitext(file_a)[0] + '_diff.html'

    # Save the diff to an HTML file
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(ratio_html)

    print(f"Processed: {output_file_name}")

print("All files have been processed.")
Processed: 0001_zz472cp8582_0001_diff.html
Processed: 0002_zz472cp8582_0002_diff.html
Processed: 0003_zz472cp8582_0003_diff.html
Processed: 0004_zz472cp8582_0004_diff.html
Processed: 0005_zz472cp8582_0005_diff.html
Processed: 0006_zz472cp8582_0006_diff.html
Processed: 0007_zz472cp8582_0007_diff.html
Processed: 0008_zz472cp8582_0008_diff.html
Processed: 0009_zz472cp8582_0009_diff.html
Processed: 0010_zz472cp8582_0010_diff.html
Processed: 0011_zz472cp8582_0011_diff.html
Processed: 0012_zz472cp8582_0012_diff.html
Processed: 0013_zz472cp8582_0013_diff.html
Processed: 0014_zz472cp8582_0014_diff.html
Processed: 0015_zz472cp8582_0015_diff.html
Processed: 0016_zz472cp8582_0016_diff.html
Processed: 0017_zz472cp8582_0017_diff.html
All files have been processed.
Script to print out Similarity Ratio (%) for each page per paper
import os
from difflib import HtmlDiff, SequenceMatcher
# Compare two directories of text files pairwise (one file per page), write
# one HTML diff report per pair, and print a table of similarity percentages.

# Define the paths to the directories
path_to_folder_a = './PrintModelText/PrintModelText/bv172tc9618_jpg/txt'
path_to_folder_b = './PrivateModel_V3/PrivateModel_V3/bv172tc9618_jpg_v3/txt'

# Create the HtmlDiff object
d = HtmlDiff()

# Get the list of files in each directory.
# Sorting makes the pairing below deterministic: files are matched purely by
# their position in the sorted listings, not by name.
files_a = sorted(os.listdir(path_to_folder_a))
files_b = sorted(os.listdir(path_to_folder_b))

# Ensure both directories have the same number of files
if len(files_a) != len(files_b):
    print("Error: The directories do not contain the same number of files.")
    # SystemExit is raised directly because exit() is only a helper injected
    # by the `site` module for interactive use; the non-zero status also marks
    # the run as failed.
    raise SystemExit(1)

# Store results for the table as (file base name, similarity percentage)
results = []

# Compare each pair of files
for file_a, file_b in zip(files_a, files_b):
    # Read the contents of the files
    with open(os.path.join(path_to_folder_a, file_a), 'r', encoding='utf-8') as fa:
        a = fa.read()

    with open(os.path.join(path_to_folder_b, file_b), 'r', encoding='utf-8') as fb:
        b = fb.read()

    # Calculate the similarity ratio using SequenceMatcher (character-level,
    # since whole strings are passed in) and express it as a percentage.
    seq_match = SequenceMatcher(None, a, b)
    ratio = seq_match.ratio()
    ratio_percentage = ratio * 100

    # Generate the HTML diff (line-level)
    html_diff = d.make_file(a.splitlines(), b.splitlines())

    # Add the similarity ratio to the HTML diff
    ratio_html = f"<h2>Similarity Ratio: {ratio_percentage:.2f}%</h2>\n" + html_diff

    # Define the output file name (named after the file from folder a)
    output_file_name = os.path.splitext(file_a)[0] + '_comp.html'

    # Save the diff to an HTML file
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(ratio_html)

    # Print out the processed file
    print(f"Processed: {output_file_name}")

    # Append the result to the list
    file_base_name = os.path.splitext(file_a)[0]
    results.append((file_base_name, ratio_percentage))

print("All files have been processed.")

# Print out the table: name left-aligned in 30 columns, percentage in 20
print("\nComparison Results:")
print(f"{'File Name and Page Number':<30} {'Similarity Ratio (%)':<20}")
print("=" * 50)
for result in results:
    print(f"{result[0]:<30} {result[1]:<20.2f}")
Processed: 0001_bv172tc9618_0001_comp.html
Processed: 0002_bv172tc9618_0002_comp.html
Processed: 0003_bv172tc9618_0003_comp.html
Processed: 0004_bv172tc9618_0004_comp.html
Processed: 0005_bv172tc9618_0005_comp.html
Processed: 0006_bv172tc9618_0006_comp.html
Processed: 0007_bv172tc9618_0007_comp.html
Processed: 0008_bv172tc9618_0008_comp.html
Processed: 0009_bv172tc9618_0009_comp.html
Processed: 0010_bv172tc9618_0010_comp.html
Processed: 0011_bv172tc9618_0011_comp.html
Processed: 0012_bv172tc9618_0012_comp.html
Processed: 0013_bv172tc9618_0013_comp.html
Processed: 0014_bv172tc9618_0014_comp.html
All files have been processed.
Comparison Results:
File Name and Page Number Similarity Ratio (%)
==================================================
0001_bv172tc9618_0001 15.33
0002_bv172tc9618_0002 8.80
0003_bv172tc9618_0003 19.17
0004_bv172tc9618_0004 37.14
0005_bv172tc9618_0005 9.09
0006_bv172tc9618_0006 39.30
0007_bv172tc9618_0007 4.11
0008_bv172tc9618_0008 27.47
0009_bv172tc9618_0009 3.04
0010_bv172tc9618_0010 11.35
0011_bv172tc9618_0011 8.27
0012_bv172tc9618_0012 50.11
0013_bv172tc9618_0013 21.28
0014_bv172tc9618_0014 44.60