import os
import re
import sys
from zipfile import BadZipFile, ZipFile
# Patterns used by findPII to recognise candidate PII strings.
# They are written as raw strings so that regex escapes such as ``\.`` and
# ``\w`` reach the ``re`` module unchanged instead of relying on Python
# passing unknown string escapes through (which emits a SyntaxWarning on
# recent interpreters). The pattern text itself is unchanged.

# E-mail address: lowercase letters/digits, optionally split by a single "."
# or "_", then "@", one word-character label, ".", and a 2-3 character suffix.
email_regex = r'[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}'
# Phone number: 3-3-4 digits separated by hyphens; the first group may be
# wrapped in parentheses.
phone_regex = r'[(]?[0-9]{3}[)]?-[0-9]{3}-[0-9]{4}'
# SSN-style number: 3-2-4 digits separated by hyphens.
ssn_regex = r'[0-9]{3}-[0-9]{2}-[0-9]{4}'
# Order matters: findPII returns hits grouped in this pattern order.
regexes = [email_regex, phone_regex, ssn_regex]
def findPII(data):
    """Return every substring of ``data`` matched by a pattern in ``regexes``.

    The patterns are tried in the order they appear in ``regexes``; the hits
    of each pattern are kept in the order ``re.findall`` reports them. The
    result is a single flat list, empty when nothing matched.
    """
    return [hit for pattern in regexes for hit in re.findall(pattern, data)]
def printMatches(filedir, matches):
    """Print ``filedir`` followed by each entry of ``matches``, one per line.

    Nothing is printed when ``matches`` is empty.
    """
    if len(matches) == 0:
        return
    # One call: the path first, then every match, each terminated by "\n".
    print(filedir, *matches, sep="\n")
def parseDocx(root, docs):
    """Scan Word documents for PII and report the hits of each document.

    Args:
        root: Directory that contains the documents.
        docs: File names (relative to ``root``) of the ``.docx`` files.

    Each document is reported individually through ``printMatches``; an
    empty ``docs`` list is a no-op. A file that is not a valid zip archive,
    or that has no ``word/document.xml`` member, is skipped with a note on
    stderr so that one bad file does not abort the whole scan.
    """
    for doc in docs:
        filedir = os.path.join(root, doc)
        try:
            # The file is opened as a zip archive and the main document
            # part is read out of it as raw bytes.
            with ZipFile(filedir, "r") as archive:
                data = archive.read("word/document.xml")
        except (BadZipFile, KeyError) as err:
            # BadZipFile: not a zip archive (e.g. an editor lock file that
            # happens to end in ".docx"); KeyError: member is missing.
            print("Skipping {}: {}".format(filedir, err), file=sys.stderr)
            continue
        # Undecodable bytes are replaced rather than raising, so the rest
        # of the text can still be searched.
        matches = findPII(data.decode("utf-8", errors="replace"))
        # Report inside the loop so every document is listed, not just the
        # last one, and nothing is referenced when ``docs`` is empty.
        printMatches(filedir, matches)
def parseText(root, txts):
    """Scan plain-text files for PII and report the hits of each file.

    Args:
        root: Directory that contains the files.
        txts: File names (relative to ``root``) of the text files to scan.

    Each file is reported individually through ``printMatches``; an empty
    ``txts`` list is a no-op.
    """
    for txt in txts:
        filedir = os.path.join(root, txt)
        # Decode explicitly as UTF-8 instead of the platform default, and
        # replace undecodable bytes so a single oddly-encoded file cannot
        # abort the scan with UnicodeDecodeError.
        with open(filedir, "r", encoding="utf-8", errors="replace") as f:
            data = f.read()
        # Report inside the loop so every file is listed, not just the last
        # one, and nothing is referenced when ``txts`` is empty.
        printMatches(filedir, findPII(data))
# File-name suffixes that findFiles hands to parseText, in scan order.
txt_ext = [
    ".txt",
    ".py",
    ".csv",
]
def findFiles(directory):
    """Walk ``directory`` recursively and scan the supported files for PII.

    For every folder visited, the ``.docx`` files are passed to
    ``parseDocx`` first; then, for each suffix in ``txt_ext`` (in order),
    the files ending with that suffix are passed to ``parseText``.
    """
    for root, _dirs, files in os.walk(directory):
        word_docs = [name for name in files if name.endswith(".docx")]
        parseDocx(root, word_docs)
        for ext in txt_ext:
            text_files = [name for name in files if name.endswith(ext)]
            parseText(root, text_files)
# Script entry: scan the "Documents" folder located under the current
# working directory. These statements run as soon as the module is loaded.
directory = os.path.join(os.getcwd(), "Documents")
findFiles(directory)
aW1wb3J0IG9zLCByZQpmcm9tIHppcGZpbGUgaW1wb3J0IFppcEZpbGUKCmVtYWlsX3JlZ2V4ID0gJ1thLXowLTldK1tcLl9dP1thLXowLTldK1tAXVx3K1suXVx3ezIsM30nCnBob25lX3JlZ2V4ID0gJ1soXT9bMC05XXszfVspXT8tWzAtOV17M30tWzAtOV17NH0nCnNzbl9yZWdleCA9ICdbMC05XXszfS1bMC05XXsyfS1bMC05XXs0fScKcmVnZXhlcyA9IFtlbWFpbF9yZWdleCwgcGhvbmVfcmVnZXgsIHNzbl9yZWdleF0KCmRlZiBmaW5kUElJKGRhdGEpOgogICAgbWF0Y2hlcyA9IFtdCiAgICBmb3IgcmVnZXggaW4gcmVnZXhlczoKICAgICAgICBtID0gcmUuZmluZGFsbChyZWdleCwgZGF0YSkKICAgICAgICBtYXRjaGVzICs9IG0KICAgIHJldHVybiBtYXRjaGVzCgpkZWYgcHJpbnRNYXRjaGVzKGZpbGVkaXIsIG1hdGNoZXMpOgogICAgaWYgbGVuKG1hdGNoZXMpPiAwOgogICAgICAgIHByaW50KGZpbGVkaXIpCiAgICAgICAgZm9yIG1hdGNoIGluIG1hdGNoZXM6CiAgICAgICAgICAgIHByaW50KG1hdGNoKQoKZGVmIHBhcnNlRG9jeChyb290LCBkb2NzKToKICAgIGZvciBkb2MgaW4gZG9jczoKICAgICAgICBtYXRjaGVzID0gTm9uZQogICAgICAgIGZpbGVkaXIgPSBvcy5wYXRoLmpvaW4ocm9vdCwgZG9jKQogICAgICAgIHdpdGggWmlwRmlsZShmaWxlZGlyLCAiciIpIGFzIHppcDoKICAgICAgICAgICAgZGF0YSA9IHppcC5yZWFkKCJ3b3JkL2RvY3VtZW50LnhtbCIpCiAgICAgICAgICAgIG1hdGNoZXMgPSBmaW5kUElJKGRhdGEuZGVjb2RlKCJ1dGYtOCIpKQogICAgcHJpbnRNYXRjaGVzKGZpbGVkaXIsIG1hdGNoZXMpCgpkZWYgcGFyc2VUZXh0KHJvb3QsIHR4dHMpOgogICAgZm9yIHR4dCBpbiB0eHRzOgogICAgICAgIGZpbGVkaXIgPSBvcy5wYXRoLmpvaW4ocm9vdCwgdHh0KQogICAgICAgIHdpdGggb3BlbihmaWxlZGlyLCAiciIpIGFzIGY6CiAgICAgICAgICAgIGRhdGEgPSBmLnJlYWQoKQogICAgICAgIG1hdGNoZXMgPSBmaW5kUElJKGRhdGEpCiAgICBwcmludE1hdGNoZXMoZmlsZWRpciwgbWF0Y2hlcykKCnR4dF9leHQgPSBbIi50eHQiLCAiLnB5IiwgIi5jc3YiXQoKZGVmIGZpbmRGaWxlcyhkaXJlY3RvcnkpOgogICAgZm9yIHJvb3QsIGRpcnMsIGZpbGVzIGluIG9zLndhbGsoZGlyZWN0b3J5KToKICAgICAgICBwYXJzZURvY3gocm9vdCwgW2YgZm9yIGYgaW4gZmlsZXMgaWYgCiAgICAgICAgICAgICAgICAgICAgICAgIGYuZW5kc3dpdGgoIi5kb2N4IikgXSkKICAgICAgICBmb3IgZXh0IGluIHR4dF9leHQ6CiAgICAgICAgICAgIHBhcnNlVGV4dChyb290LCBbZiBmb3IgZiBpbiBmaWxlcyBpZgogICAgICAgICAgICAgICAgICAgICAgICAgICAgZi5lbmRzd2l0aChleHQpXSkKICAgICAgICAgICAgCmRpcmVjdG9yeSA9IG9zLnBhdGguam9pbihvcy5nZXRjd2QoKSwiRG9jdW1lbnRzIikKZmluZEZpbGVzKGRpcmVjdG9yeSk=