From a39b48265eb4d241c6c920a10661efeb9830db39 Mon Sep 17 00:00:00 2001
From: ssstvinc2
Date: Sun, 12 Feb 2017 11:18:30 -0500
Subject: Added try/except pairs to handle parsing errors; reduced terminal output

---
 unbiasedFunctions.py | 102 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 57 insertions(+), 45 deletions(-)

diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index 2cdae81..8caa919 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -6,9 +6,12 @@ import time
 
 #take in a url and delimiters, return twitter card
 def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
+    ''' PRINT DEBUGGING
     print(sourceName)
     print(url)
     print()
+    '''
+
     #download url
     os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
 
@@ -79,41 +82,48 @@
     h2s=[]
     h3s=[]
 
-    h1=content
-    if source.h1SectionDividerStart!=None:
-        h1=h1.split(source.h1SectionDividerStart)[1]
-    if source.h1SectionDividerEnd!=None:
-        h1=h1.split(source.h1SectionDividerEnd)[0]
-    for delim in source.h1DelStart:
-        h1=h1.split(delim)[1]
-    h1=h1.split(source.h1DelEnd)[0]
-    if '.com' not in h1:
-        if source.stubURL!=None:
-            h1=source.stubURL+h1
-        else:
-            h1=source.url+h1
-    h1s.append(h1)
-
-    h2=content
-    if source.h2SectionDividerStart!=None:
-        h2=h2.split(source.h2SectionDividerStart, 1)[1]
-    if source.h2SectionDividerEnd!=None:
-        h2=h2.split(source.h2SectionDividerEnd, 1)[0]
-
-    while source.h2DelStart[0] in h2:
-        x=h2
-        for delim in source.h2DelStart:
-            x=x.split(delim)[1]
-            h2=h2.split(delim, 1)[1]
-        x=x.split(source.h2DelEnd)[0]
-        h2=h2.split(source.h2DelEnd, 1)[1]
-        if '.com' not in x:
+    try:
+        h1=content
+        if source.h1SectionDividerStart!=None:
+            h1=h1.split(source.h1SectionDividerStart)[1]
+        if source.h1SectionDividerEnd!=None:
+            h1=h1.split(source.h1SectionDividerEnd)[0]
+        for delim in source.h1DelStart:
+            h1=h1.split(delim)[1]
+        h1=h1.split(source.h1DelEnd)[0]
+        if '.com' not in h1:
             if source.stubURL!=None:
-                x=source.stubURL+x
+                h1=source.stubURL+h1
             else:
-                x=source.url+x
-        h2s.append(x)
-
+                h1=source.url+h1
+        h1s.append(h1)
+    except:
+        print("Parse error in extractURLs: "+source.name+" h1")
+        h1s=None
+
+    try:
+        h2=content
+        if source.h2SectionDividerStart!=None:
+            h2=h2.split(source.h2SectionDividerStart, 1)[1]
+        if source.h2SectionDividerEnd!=None:
+            h2=h2.split(source.h2SectionDividerEnd, 1)[0]
+
+        while source.h2DelStart[0] in h2:
+            x=h2
+            for delim in source.h2DelStart:
+                x=x.split(delim)[1]
+                h2=h2.split(delim, 1)[1]
+            x=x.split(source.h2DelEnd)[0]
+            h2=h2.split(source.h2DelEnd, 1)[1]
+            if '.com' not in x:
+                if source.stubURL!=None:
+                    x=source.stubURL+x
+                else:
+                    x=source.url+x
+            h2s.append(x)
+    except:
+        print("Parse error in extractURLs: "+source.name+" h2")
+        h2s=None
 
     return h1s, h2s, h3s
 
@@ -146,7 +156,6 @@
 
     for i in range(len(h2RandomSources)):
         source=newsSourceArr[h2RandomSources[i]]
-        print(source.name)
        randomArticle=random.sample(range(len(source.h2Arr)), 1)[0]
         article=source.h2Arr[randomArticle]
         template=template.replace('xxURL2-'+str(i+1)+'xx', article.url)
@@ -157,7 +166,7 @@
     for i in range(len(newsSourceArr)-1):
         sourcesStr+=newsSourceArr[i].name+', '
     sourcesStr+=newsSourceArr[-1].name
-    print(sourcesStr)
+    print('Successfully parsed: '+sourcesStr)
 
     template=template.replace('xxSourcesxx', sourcesStr)
 
@@ -194,16 +203,19 @@
         h1s, h2s, h3s=extractURLs(content, source)
 
         #build the Article objects and add to newsSource's appropriate list
-        for url in h1s:
-            article=buildArticle(url, source.name)
-            source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
-        for url in h2s:
-            article=buildArticle(url, source.name)
-            sourceList[i].h2Arr.append(article)
-        for url in h3s:
-            article=buildArticle(url, source.name)
-            sourceList[i].h3Arr.append(article)
-
+        if h1s!=None and h2s!=None:
+            for url in h1s:
+                article=buildArticle(url, source.name)
+                source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
+            for url in h2s:
+                article=buildArticle(url, source.name)
+                sourceList[i].h2Arr.append(article)
+            for url in h3s:
+                article=buildArticle(url, source.name)
+                sourceList[i].h3Arr.append(article)
+        else:
+            sourceList.remove(source)
+
     #return the original sourceList,
     #since everything should have been modified in place
     return sourceList
-- 
cgit v1.2.3
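
Editor's note: the guard-and-skip pattern this commit introduces can be sketched in isolation. The snippet below is a minimal, self-contained illustration, not code from this repo: Source, extract_urls, and the sample markup are invented stand-ins. It also makes two deliberate substitutions, catching Exception rather than using the patch's bare except: (which also traps KeyboardInterrupt and SystemExit), and skipping a failed source with continue instead of calling sourceList.remove() mid-iteration.

    # Minimal sketch of the commit's guard-and-skip pattern. Source,
    # extract_urls, and the sample markup are illustrative, not repo code.

    class Source:
        def __init__(self, name, content):
            self.name = name        # used in the parse-error message
            self.content = content  # raw HTML to pull URLs out of

    def extract_urls(content):
        # Stand-in for extractURLs(): like the real parser's split(delim)[1]
        # chains, indexing after a failed split raises IndexError.
        body = content.split('<body>')[1]
        return [chunk.split('"')[0] for chunk in body.split('href="')[1:]]

    def build_source_list(sources):
        parsed = []
        for source in sources:
            try:
                urls = extract_urls(source.content)
            except Exception:
                # Narrower than the patch's bare except:, which would also
                # swallow KeyboardInterrupt and SystemExit.
                print('Parse error: ' + source.name)
                continue  # skip the failed source instead of remove()-ing it
            parsed.append((source, urls))
        return parsed

    if __name__ == '__main__':
        sources = [Source('good', '<body><a href="http://example.com/a">a</a>'),
                   Source('bad', '<html>no body marker</html>')]
        for source, urls in build_source_list(sources):
            print(source.name, urls)

Building a new list (or continuing past the failure) sidesteps a subtle hazard in the patch's else branch: sourceList.remove(source) shrinks the list while it is still being traversed, and the surrounding loop's use of sourceList[i] suggests position-based indexing, in which case the element after the removed one would be skipped.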