fork download
  1. import os, re
  2. from zipfile import ZipFile
  3.  
  4. email_regex = '[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}'
  5. phone_regex = '[(]?[0-9]{3}[)]?-[0-9]{3}-[0-9]{4}'
  6. ssn_regex = '[0-9]{3}-[0-9]{2}-[0-9]{4}'
  7. regexes = [email_regex, phone_regex, ssn_regex]
  8.  
  9. def findPII(data):
  10. matches = []
  11. for regex in regexes:
  12. m = re.findall(regex, data)
  13. matches += m
  14. return matches
  15.  
  16. def printMatches(filedir, matches):
  17. if len(matches)> 0:
  18. print(filedir)
  19. for match in matches:
  20. print(match)
  21.  
  22. def parseDocx(root, docs):
  23. for doc in docs:
  24. matches = None
  25. filedir = os.path.join(root, doc)
  26. with ZipFile(filedir, "r") as zip:
  27. data = zip.read("word/document.xml")
  28. matches = findPII(data.decode("utf-8"))
  29. printMatches(filedir, matches)
  30.  
  31. def parseText(root, txts):
  32. for txt in txts:
  33. filedir = os.path.join(root, txt)
  34. with open(filedir, "r") as f:
  35. data = f.read()
  36. matches = findPII(data)
  37. printMatches(filedir, matches)
  38.  
  39. txt_ext = [".txt", ".py", ".csv"]
  40.  
  41. def findFiles(directory):
  42. for root, dirs, files in os.walk(directory):
  43. parseDocx(root, [f for f in files if
  44. f.endswith(".docx") ])
  45. for ext in txt_ext:
  46. parseText(root, [f for f in files if
  47. f.endswith(ext)])
  48.  
  49. directory = os.path.join(os.getcwd(),"Documents")
  50. findFiles(directory)
Success #stdin #stdout 0.22s 17816KB
stdin
Standard input is empty
stdout
Standard output is empty