From 887489c283ba2bbf0538e7eb7c4531ca7c07a769 Mon Sep 17 00:00:00 2001 From: thomas grothe Date: Thu, 30 Nov 2023 11:00:32 -0600 Subject: [PATCH] wow.py now getting all text from the recursive file scan. now to implement grabbing a random file and some random lines from it --- wow.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/wow.py b/wow.py index a9860cf..de563e8 100755 --- a/wow.py +++ b/wow.py @@ -1,8 +1,10 @@ #!/usr/bin/python #Words Of Wisdom -# output some random text from my journal +# output some random text from my journals and writings import os +import sys +import time import filetype #for filetype (extension and mime type) import chardet #for getting encoding import subprocess @@ -19,23 +21,29 @@ def addSampleFileIfTxt(filepath): ftype = filetype.guess(filepath) if v: print('filetype: {}'.format(str(ftype))) if v: print('encoding: {}'.format(enc)) + if ftype != None: + if ftype.extension in ['py', 'c', 'cc', 'h', 'hh', 'java']: #don't want code in the sample data + return + if ftype.extension == 'odt': + if v: print('converting odt: {}'.format(filepath)) + subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8', stdout=subprocess.PIPE) + total_text += str(subproc.stdout) if enc in ['ascii','utf-8']: if v: print(fb) - total_text += str(fb) - elif ftype.extension == 'odt': - total_text += subprocess.run(['odt2txt', filepath]).stdout + total_text += str(fb, encoding='utf-8') + else: if v: print('{} is not txt') else: if v: print('not file') - +#paths=['/home/thomas/doc/fiction'] paths=['/home/thomas/doc/j/', '/home/thomas/_poetry', '/home/thomas/doc/_journal_2019'] #the paths to scan recursively for files from which to grab text samplefiles=[] #the individual files we want to grab text from -v=True #verbose - +v=False #verbose +tStart = time.time() for p in paths: if v: print('path {}'.format(p)) if os.path.isdir(p): @@ -46,6 +54,11 @@ def addSampleFileIfTxt(filepath): else: addSampleFileIfTxt(p) +#now we have all of our files of interest. so pick a random one + +tEnd = time.time() +tDuration = tEnd - tStart +print('report generated in {} seconds, from paths {}'.format(tDuration, str(paths))) print(total_text) #for f in samplefiles: # print(f)