Track changes to doc and docx using Python

Dear colleagues! Please don't judge too harshly, as this is my first article.

We needed to track changes to doc and docx files and record the name of the user who made each change. The files themselves live in a shared folder (yes, yes, shared folders are evil, but I failed to convince them otherwise), so we need to know who made the changes. More details below the cut.

We will track the changes in the files themselves using watchdog (pip install watchdog).

Code to track all changes:

# For tracking changes
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# Track changes
class Watcher:
    """Watch a directory tree with watchdog and pass events to ``Handler``."""

    def __init__(self, path):
        """Create the observer and remember the directory to watch.

        Args:
            path: Directory handed to ``Observer.schedule`` by ``run``.
        """
        self.observer = Observer()
        self.path = path

    def run(self):
        """Start watching ``self.path`` recursively and block until stopped.

        The caller sleeps in one-second steps while the observer runs.
        Ctrl+C (``KeyboardInterrupt``) ends the loop quietly; any other
        exception propagates to the caller. Either way the observer is
        stopped and joined before this method returns or raises.
        """
        event_handler = Handler()
        self.observer.schedule(event_handler, self.path, recursive=True)
        self.observer.start()
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            # Ctrl+C is the expected way to stop the watcher, so it is not
            # reported as an error (a bare ``except`` would also have
            # swallowed SystemExit and every real failure).
            pass
        finally:
            # Always shut the observer down, whatever ended the loop.
            self.observer.stop()
            self.observer.join()


class Handler(FileSystemEventHandler):
    """Event handler that prints one line for every non-directory event."""

    @staticmethod
    def on_any_event(event):
        """Print the current time, the event type and the source path.

        Events whose ``is_directory`` flag is set are ignored.
        """
        line_template = "[{}] noticed: [{}] on: [{}] "
        if not event.is_directory:
            stamp = time.asctime()
            print(line_template.format(stamp, event.event_type, event.src_path))
        return None
        

if __name__ == "__main__":
    # Example target only: watch the user's desktop folder until interrupted.
    watcher = Watcher('C:\\Users\\user\\Desktop\\')
    watcher.run()

This code lets you track changes to all files on the user’s desktop (without showing who made the change). Naturally, the folder “C:\Users\user\Desktop\” is given only as an example.

Sample output:

[Mon Dec 5 14:32:37 2022] noticed: [modified] on: [C:\Users\user\Desktop\Документ Microsoft Word.docx]

There is no information about who modified the file. This information can be obtained from the docx file itself.

We need to track changes in doc and docx files, so let’s rewrite the code:

# -*- coding: utf-8 -*-

# For file attributes
import docx
# For tracking changes
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# Track changes
class Watcher:
    """Watch a directory tree with watchdog and pass events to ``Handler``."""

    def __init__(self, path):
        """Create the observer and remember the directory to watch.

        Args:
            path: Directory handed to ``Observer.schedule`` by ``run``.
        """
        self.observer = Observer()
        self.path = path

    def run(self):
        """Start watching ``self.path`` recursively and block until stopped.

        The caller sleeps in one-second steps while the observer runs.
        Ctrl+C (``KeyboardInterrupt``) ends the loop quietly; any other
        exception propagates to the caller. Either way the observer is
        stopped and joined before this method returns or raises.
        """
        event_handler = Handler()
        self.observer.schedule(event_handler, self.path, recursive=True)
        self.observer.start()
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            # Ctrl+C is the expected way to stop the watcher, so it is not
            # reported as an error (a bare ``except`` would also have
            # swallowed SystemExit and every real failure).
            pass
        finally:
            # Always shut the observer down, whatever ended the loop.
            self.observer.stop()
            self.observer.join()


class Handler(FileSystemEventHandler):
    """Report who last modified a .docx file when an event arrives for it."""

    @staticmethod
    def on_any_event(event):
        """Print the last editor and modification time stored in a .docx file.

        Only ``created`` and ``modified`` events for regular ``.docx`` files
        are processed; everything else is ignored. If the document cannot
        be opened, a short diagnostic is printed instead of failing silently.

        Args:
            event: Event object supplied by watchdog; ``is_directory``,
                ``event_type`` and ``src_path`` are read.

        Returns:
            None in every case.
        """
        # Function-scope import so the snippet does not depend on the
        # file-level import list being edited.
        import os

        if event.is_directory:
            return None

        # For deleted/moved events the source path no longer exists, so
        # there is nothing to open.
        if event.event_type not in ('created', 'modified'):
            return None

        # Skip temporary files (same filter as before).
        if '.tmp' in event.src_path:
            return None

        file_name = os.path.basename(event.src_path)
        # python-docx opens .docx packages only (not legacy .doc), and
        # names starting with "~$" are Word's lock files, not documents.
        if not file_name.lower().endswith('.docx') or file_name.startswith('~$'):
            return None

        # Read the file attributes. The file may still be locked or
        # half-written while Word is saving, so failures are reported and
        # skipped rather than allowed to break event handling.
        try:
            document = docx.Document(docx=event.src_path)
            core_properties = document.core_properties
        except Exception as error:
            print(f'Could not read properties of {event.src_path}: {error!r}')
            return None

        print(f'{core_properties.last_modified_by} в {core_properties.modified} модифицировал файл {event.src_path}')
        return None

if __name__ == "__main__":
    # Example target only: watch the user's desktop folder until interrupted.
    watcher = Watcher('C:\\Users\\user\\Desktop\\')
    watcher.run()

In this example, the changes are simply printed to the console with the line:

print(f'{core_properties.last_modified_by} в {core_properties.modified} модифицировал файл {event.src_path}')

Sample output (shown here translated into English):

Ivanov Ivan Ivanovich at 2022-12-05 07:35:00 modified the file C:\Users\user\Desktop\Microsoft Word Document.docx

For logging, the simplest option is to write the line to a file:

# Open in append mode so earlier entries are kept; mode 'w' would truncate
# the log on every write and leave only the most recent change.
# UTF-8 is set explicitly because the message contains Cyrillic text.
with open('change-doc.log', 'a', encoding='utf-8') as f:
    # Write one line per detected change; the file is closed on leaving the block.
    f.write(f'{core_properties.last_modified_by} в {core_properties.modified} модифицировал файл {event.src_path}\n')

Or you can send a notification about the change by email (if required):

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
 
# Placeholder sender, recipient and credentials for the example.
# NOTE(review): do not hard-code a real password in the script; load it
# from the environment or a secrets store instead.
fromaddr = "test@mail.ru"
toaddr = "test2@mail.ru"
mypass = "password"

# Build the message: headers first, then a plain-text body.
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['To'] = toaddr
msg['Subject'] = f'Модифицирован файл {event.src_path}'

body = f'{core_properties.last_modified_by} в {core_properties.modified} модифицировал файл {event.src_path}'
msg.attach(MIMEText(body, 'plain'))

# The context manager issues QUIT and closes the connection even when
# login or sending raises, so the socket is never leaked.
with smtplib.SMTP_SSL('smtp.mail.ru', 465) as server:
    server.login(fromaddr, mypass)
    server.sendmail(fromaddr, toaddr, msg.as_string())

Here is also a complete list of the docx file attributes that can be read in the same way:

# Example document path; replace with a real .docx file.
file_name = "C:\\Users\\user\\Desktop\\123.docx"

document = docx.Document(docx=file_name)
core_properties = document.core_properties

# Attribute names read from ``core_properties``, each listed exactly once
# ('keywords' used to be printed twice).
property_names = (
    'author',
    'created',
    'last_modified_by',
    'last_printed',
    'modified',
    'revision',
    'title',
    'category',
    'comments',
    'identifier',
    'keywords',
    'language',
    'subject',
    'version',
    'content_status',
)

# Print "<name> <value>" for every property, in the order listed above.
for property_name in property_names:
    print(property_name, getattr(core_properties, property_name))

Thank you for your attention!!!

Similar Posts

Leave a Reply

Your email address will not be published. Required fields are marked *