## Copyright 2024, Spirion LLC All Rights Reserved ##
"""Apply a visible content mark (watermark) to a file found by a Spirion scan.

Usage: pass the path of the file to mark as the command-line argument(s).

Images, PDFs and PowerPoint files receive a semi-transparent text stamp, Word
documents receive header/footer text, and Excel workbooks receive a trailing
row.  Files are modified in place and results are appended to an hourly log.
"""

import os
import tempfile
from sys import argv
from io import BytesIO
from pathlib import Path
from datetime import datetime

# pip install pillow pypdf python-docx openpyxl python-pptx exiftool
# Images
from PIL import Image, ImageDraw, ImageFont
from exiftool import ExifToolHelper, ExifTool
# PDFs
from pypdf import PdfWriter, PdfReader
# Word
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
# Excel
import openpyxl
# PowerPoint
from pptx import Presentation

# # # # # # # # # # # #  C U S T O M I Z E  # # # # # # # # # # # # #
#                                                                   #
#  MARK COUNT: the number of times the watermark appears,           #
#  a value between 1 and 5 for most document sizes.                 #
MARK_COUNT = 1
#                                                                   #
#  WATERMARK TEXT: the string to be applied as a watermark.         #
#  NOTE: font size decreases as more chars are added; keep          #
#  watermarks between 3 and 30 characters for images and pdfs.      #
WATERMARK_TEXT = "SPIRION WATERMARK"
#                                                                   #
#  LOG PATH: the path for logging.                                  #
LOG_PATH = "c:\\temp"
#                                                                   #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Extensions routed through the image pipeline in process_file().
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp')
# Image types whose XMP block is saved before marking and restored afterwards.
_XMP_EXTENSIONS = ('.jpg', '.jpeg', '.png')
# Scratch file that carries the XMP block across the PIL save.  It lives in
# the system temp directory (not the current directory) and is per-process.
_XMP_SIDECAR = os.path.join(
    tempfile.gettempdir(), f"spirion_mark_{os.getpid()}.xmp"
)

# Log file used by the marking functions.  It is assigned when the script is
# run directly; while it is None, log_event() resolves the path on demand so
# the functions also work when this module is imported.
log_location = None


def process_file(file_path, WATERMARK_TEXT):
    """Mark one file in place, choosing the strategy from its extension.

    Args:
        file_path: Path of the file to mark.
        WATERMARK_TEXT: Text to apply as the content mark.

    Returns:
        True when a mark was applied, False for unsupported file types.
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()
    marked = False

    # TODO: Add GIF support
    if file_extension in _IMAGE_EXTENSIONS:
        file_dimensions = get_file_dimensions(file_path)
        watermark_stamp = generate_watermark(WATERMARK_TEXT, file_dimensions)
        has_metadata = (
            file_extension in _XMP_EXTENSIONS and get_metadata(file_path)
        )
        try:
            marked = mark_image(file_path, file_extension, watermark_stamp)
            if has_metadata:
                set_metadata(file_path)
        finally:
            # Never leave the scratch file behind, even if marking failed.
            _discard_xmp_sidecar()
    elif file_extension == '.pdf':
        file_dimensions = get_file_dimensions(file_path)
        watermark_stamp = generate_watermark(WATERMARK_TEXT, file_dimensions)
        marked = mark_pdf(
            file_path, watermark_stamp, file_dimensions, WATERMARK_TEXT
        )
    elif file_extension == '.docx':
        marked = mark_docx(file_path, WATERMARK_TEXT)
    elif file_extension == '.xlsx':
        marked = mark_xlsx(file_path, WATERMARK_TEXT)
    elif file_extension == '.pptx':
        file_dimensions = get_file_dimensions(file_path)
        watermark_stamp = generate_watermark(WATERMARK_TEXT, file_dimensions)
        marked = mark_pptx(file_path, watermark_stamp)
    else:
        marked = mark_other(file_path)

    if marked:
        log_event(log_location, f"MARKED ---- Successfully applied content mark ('{ file_path }')")
    return marked


def mark_image(file_path, file_extension, watermark_stamp):
    """Paste the watermark over an image and save it back to the same path.

    Args:
        file_path: Path of the image to mark.
        file_extension: Lower-case extension including the dot.
        watermark_stamp: RGBA image used both as the overlay and as its mask.

    Returns:
        True once the image has been saved.
    """
    # The save format is derived from the extension; "jpg" and "tif" are
    # mapped to the long names.
    if file_extension == '.jpg':
        file_extension = '.jpeg'
    if file_extension == '.tif':
        file_extension = '.tiff'

    # The context manager closes the source file even if paste/save raises.
    with Image.open(file_path) as source_image:
        if file_extension == '.png':
            target_image = source_image.convert('RGBA')
        else:
            target_image = source_image
        target_image.paste(watermark_stamp, (0, 0), watermark_stamp)
        target_image.save(file_path, format=file_extension[1:])
    return True


def _discard_xmp_sidecar():
    """Delete the XMP scratch file if it exists."""
    try:
        os.remove(_XMP_SIDECAR)
    except FileNotFoundError:
        pass


# PIL image saving doesn't preserve XMP metadata
# TODO: Replace the scratch file with an in-memory temp
def get_metadata(file_path):
    """Ask ExifTool to copy the file's XMP block into the scratch file.

    Returns:
        True when the scratch file exists afterwards, so callers only attempt
        a restore when there is something to restore.
    """
    # Start clean so a leftover file from an earlier run is never reused.
    _discard_xmp_sidecar()
    with ExifTool() as et:
        et.execute("-tagsFromFile", file_path, "-xmp", _XMP_SIDECAR)
    return os.path.exists(_XMP_SIDECAR)


def set_metadata(file_path):
    """Copy the saved XMP block back into the file, then delete the scratch file."""
    try:
        with ExifTool() as et:
            et.execute("-tagsFromFile", _XMP_SIDECAR, "-xmp", file_path, "-overwrite_original")
    finally:
        _discard_xmp_sidecar()


def _stamp_to_pdf_page(stamp_image):
    """Render a watermark image as a one-page PDF and return that page."""
    # A fresh buffer per stamp keeps each rendered PDF self-contained.
    stream = BytesIO()
    stamp_image.save(stream, 'PDF')
    stream.seek(0)
    return PdfReader(stream).pages[0]


def mark_pdf(file_path, watermark_stamp, dimensions, watermark_text=None):
    """Overlay the watermark on every page of a PDF, in place.

    Args:
        file_path: Path of the PDF to mark.
        watermark_stamp: Stamp image already generated for ``dimensions``.
        dimensions: (width, height) that ``watermark_stamp`` was built for.
        watermark_text: Text for stamps generated for pages of other sizes;
            defaults to the module-level WATERMARK_TEXT.

    Returns:
        True once the PDF has been written.
    """
    if watermark_text is None:
        watermark_text = WATERMARK_TEXT

    def size_key(size):
        # generate_watermark() truncates to whole units, so sizes that agree
        # after truncation share a stamp.
        width, height = size
        return int(width), int(height)

    # One stamp page per distinct page size, so every page gets the stamp
    # that matches its own size regardless of page order.
    stamps = {size_key(dimensions): _stamp_to_pdf_page(watermark_stamp)}

    writer = PdfWriter(clone_from=file_path)
    for page in writer.pages:
        page_size = size_key((page.mediabox.width, page.mediabox.height))
        if page_size not in stamps:
            stamps[page_size] = _stamp_to_pdf_page(
                generate_watermark(watermark_text, page_size)
            )
        page.merge_page(stamps[page_size], over=True)

    writer.write(file_path)
    return True


def mark_docx(file_path, WATERMARK_TEXT):
    """Add the watermark text to the header and footer of every section.

    Returns:
        True once the document has been saved.
    """
    doc = Document(file_path)
    for section in doc.sections:
        for part in (section.header, section.footer):
            if part is not None:
                label_docx(part, WATERMARK_TEXT)
    doc.save(file_path)
    return True


def label_docx(part, WATERMARK_TEXT):
    """Put the watermark text at the top of a header or footer part.

    If the part already has paragraphs, the text is prepended to the first
    one; otherwise a new centred 10pt paragraph is added.
    """
    paragraphs = part.paragraphs
    if paragraphs:
        first_paragraph = paragraphs[0]
        existing_text = first_paragraph.text
        first_paragraph.text = f"{ WATERMARK_TEXT }\n{ existing_text }"
        return

    paragraph = part.add_paragraph()
    run = paragraph.add_run(WATERMARK_TEXT)
    run.font.size = Pt(10)  # Adjust font size as needed
    paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER


def mark_xlsx(file_path, WATERMARK_TEXT):
    """Write the watermark text below the last used row of every worksheet.

    Returns:
        True once the workbook has been saved.
    """
    # Insert as "footer" to minimize impact to existing content
    wb = openpyxl.load_workbook(file_path)
    for sheet in wb.worksheets:
        footer_row = sheet.max_row + 1
        sheet.insert_rows(footer_row)
        sheet['A' + str(footer_row)].value = WATERMARK_TEXT
    wb.save(file_path)
    return True


def mark_pptx(file_path, watermark_stamp):
    """Add the watermark image to every slide of a presentation, in place.

    Returns:
        True once the presentation has been saved.
    """
    prs = Presentation(file_path)
    image_stream = BytesIO()
    watermark_stamp.save(image_stream, 'PNG')
    image_stream.seek(0)

    # NOTE: First arg of "insert(int, image)" controls z axis
    for slide in prs.slides:
        picture = slide.shapes.add_picture(image_stream, 0, 0)
        shape_tree = slide.shapes._spTree
        shape_tree.remove(picture._element)
        shape_tree.insert(99, picture._element)

    prs.save(file_path)
    return True


def mark_other(file_path):
    """Log that the file type is unsupported and report that nothing was marked."""
    log_event(log_location, f"ERROR ----- Unsupported file type ('{ file_path }')")
    return False


## Dimensions ##
# NOTE: Some "get_dimensions" functions went unused ("docx" and "xlsx").
# I had originally wanted to embed background images to all file types,
# and hadn't considered that PDFs can have different image sizes per page.
def get_file_dimensions(file_path):
    """Return (width, height) for the file, or None when no size is defined.

    None is returned for '.xlsx' and for unrecognised extensions.
    """
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp', '.gif']:
        return get_image_dimensions(file_path)
    if file_extension == '.pdf':
        return get_pdf_dimensions(file_path)
    if file_extension == '.docx':
        return get_docx_dimensions(file_path)
    if file_extension == '.pptx':
        return get_pptx_dimensions(file_path)
    return None


def get_image_dimensions(file_path):
    """Return the image's (width, height) as reported by PIL."""
    with Image.open(file_path) as img:
        return img.size


def get_pdf_dimensions(file_path):
    """Return [width, height] of the first page's media box."""
    reader = PdfReader(file_path)
    box = reader.pages[0].mediabox
    return [box.width, box.height]


def get_docx_dimensions(file_path):
    """Return (page_width, page_height) in points for the first section.

    Logs an error and returns None when the document has no sections.
    """
    sections = Document(file_path).sections
    if not sections:
        log_event(log_location, f"ERROR ----- No sections found in file ('{ file_path }')")
        return None
    first_section = sections[0]
    return first_section.page_width.pt, first_section.page_height.pt


def get_xlsx_dimensions(file_path):
    """Return (max_row, max_column) of the workbook's active sheet."""
    sheet = openpyxl.load_workbook(file_path).active
    return sheet.max_row, sheet.max_column


def get_pptx_dimensions(file_path):
    """Return (slide_width, slide_height) in whole points."""
    presentation = Presentation(file_path)
    slide_width = int(presentation.slide_width.pt)
    slide_height = int(presentation.slide_height.pt)
    return slide_width, slide_height


## Drawing ##
# Font scaling
def get_font_size(text, width, height):
    """Return a font size scaled to the shorter side and the text length.

    The result is always at least 1, so empty text and very small canvases
    still give a usable size.
    """
    shorter_side = min(width, height)
    font_size = 1.25 * shorter_side // max(len(text), 1)
    return max(int(font_size), 1)


def get_tile_size(font_size, width, height, MARK_COUNT):
    """Return the (width, height) of one watermark tile.

    Tiles span the full width and split the height into MARK_COUNT rows.
    Both values are at least 1 so callers can safely divide by them.
    """
    # "MARK_COUNT" is set in the CUSTOMIZE section of this script
    rows = max(int(MARK_COUNT), 1)
    tile_x = max(width, 1)
    tile_y = max(height // rows, 1)
    return tile_x, tile_y


# TODO: Devise different patterns for watermark stamp
def generate_watermark(text, dimensions):
    """Build a transparent RGBA image with ``text`` centred in each tile.

    Args:
        text: Watermark text to draw.
        dimensions: (width, height) of the target; values are truncated to int.

    Returns:
        An RGBA image of the requested size.
    """
    width, height = dimensions
    width = int(width)
    height = int(height)

    image = Image.new('RGBA', (width, height), (255, 255, 255, 0))
    font_size = get_font_size(text, width, height)
    tile_width, tile_height = get_tile_size(font_size, width, height, MARK_COUNT)
    font = ImageFont.load_default(font_size)
    draw = ImageDraw.Draw(image, 'RGBA')

    # The text extent is the same for every tile, so measure it once.
    text_bbox = draw.textbbox((0, 0), text, font=font)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]

    # NOTE: Tile width is currently set to page width, meaning
    # 1 column of stamps until adjusted above (get_tile_size)
    num_tiles_x = (width + tile_width - 1) // tile_width
    num_tiles_y = (height + tile_height - 1) // tile_height

    for i in range(num_tiles_x):
        for j in range(num_tiles_y):
            x_text = i * tile_width + (tile_width - text_width) // 2
            y_text = j * tile_height + (tile_height - text_height) // 2
            # Adjust "fill" value to modify RGBA of watermark
            draw.text((x_text, y_text), text, font=font, fill=(160, 0, 0, 64))

    return image


## Logging ##
def get_log_path():
    """Return the current log file path inside LOG_PATH (one file per hour)."""
    stamp = datetime.now().strftime("%Y%m%d%H")
    log_name = f"spirion_marking_{ stamp }"
    return Path(LOG_PATH) / f"{ log_name }.log"


def get_log_timestamp():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


# Creates log if it doesn't exist
# NOTE: file rotate hourly by default
def set_log(LOG_PATH):
    """Create the log file if needed and write the column header when empty.

    Args:
        LOG_PATH: Path of the log file (str or Path).
    """
    log_file = Path(LOG_PATH)
    # Make sure the directory exists before opening the file.
    log_file.parent.mkdir(parents=True, exist_ok=True)
    with open(log_file, "a+", encoding="utf-8") as f:
        if log_file.stat().st_size == 0:
            f.write("TIMESTAMP" + "\t\t" + "LOG MESSAGE" + "\n" + "--------------------------------------" + "\n")


def log_event(log_location, message):
    """Append one timestamped line to the log.

    Args:
        log_location: Path of the log file; None selects the current default
            log and initialises it first.
        message: Text to record.
    """
    if log_location is None:
        log_location = get_log_path()
        set_log(log_location)
    # UTF-8 so file names with non-ASCII characters can always be written.
    with open(log_location, "a+", encoding="utf-8") as f:
        f.write(get_log_timestamp() + "\t" + f"{ message }\n")


def main():
    """Mark the file named on the command line; return process_file's result."""
    global log_location

    # Set log path from variable in "CUSTOMIZE"
    log_location = get_log_path()
    set_log(log_location)

    # Handle input #
    # Define match location from Spirion scan result
    spirion_path = " ".join(argv[1:])

    # Remove quotes (if present -- depends on Spirion script output)
    if (len(spirion_path) >= 2
            and spirion_path[0] == spirion_path[-1]
            and spirion_path[0] in "'\""):
        spirion_path = spirion_path[1:-1]

    if not spirion_path:
        log_event(log_location, "ERROR ----- No file path supplied")
        return False

    try:
        return process_file(spirion_path, WATERMARK_TEXT)
    except Exception as error:
        # Record the failure, then re-raise so the exit status and traceback
        # are unchanged.
        log_event(log_location, f"ERROR ----- Failed to apply content mark ('{ spirion_path }'): { error }")
        raise


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
if __name__ == "__main__":
    marked_file = main()

## Copyright 2024, Spirion LLC All Rights Reserved ##