From 233eb048a9bc2c4b84e1ae6a47de6b088779ee4e Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Wed, 15 Feb 2017 23:33:56 -0600 Subject: Fixed NYT, plus other parsing fixes and a minor visual tweak --- html_template/unbiased.css | 2 +- main.py | 7 ++++--- parser.py | 19 ++++++++++++++----- unbiasedFunctions.py | 26 ++++++++++++++++++-------- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/html_template/unbiased.css b/html_template/unbiased.css index 86f653a..90c604a 100644 --- a/html_template/unbiased.css +++ b/html_template/unbiased.css @@ -69,7 +69,7 @@ a:link, a:visited, a:hover, a:active { vertical-align:top; text-align:left; width:360px; - height:322px; + height:352px; overflow:hidden; background:#fff; margin:10px; diff --git a/main.py b/main.py index 92f96ae..296de05 100644 --- a/main.py +++ b/main.py @@ -15,6 +15,10 @@ def main(): def run(): sourceList=[] + + nyt=buildNYT() + sourceList.append(nyt) + blz=buildBlaze() sourceList.append(blz) @@ -30,9 +34,6 @@ def run(): wkl=buildWeeklyStandard() sourceList.append(wkl) - #nyt=buildNYT() - #sourceList.append(nyt) - fox=buildFoxNews() sourceList.append(fox) diff --git a/parser.py b/parser.py index 53b3261..ef90eee 100644 --- a/parser.py +++ b/parser.py @@ -225,6 +225,8 @@ def buildBlaze(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) blz=buildNewsSource2(name, url, h1s, h2s, h3s) + blz=removeBadStories(blz, None, ['Tomi Lahren'], None) + #The Blaze has dumb, short description fields, so we need to grab #the first x characters of actual article text instead blz.h1Arr=blazeFixDesc(blz.h1Arr) @@ -502,10 +504,17 @@ def buildNYT(): #this will likely need if/else logic h1=content - #This is with a large headline over a and b columns - h1=h1.split('story theme-summary banner', 1)[1] - h1=h1.split('', 1)[1] + h1=h1.split('')[0] - author=author[:-1] - break + if sourceName!='The Blaze': + authorTags=['article:author', 'dc.creator'] + for tag in authorTags: + if tag in content: + author=content.split(tag+'" content=')[1][1:].split('>')[0] + author=author[:-1] + break + #handle The Blaze + else: + if 'class="article-author">' in content: + author=content.split('class="article-author">')[1].split('<')[0] + elif 'class="article-author" href="' in content: + author=content.split('class="article-author" href="')[1] + author=author.split('>')[1].split('<')[0].strip() description=content.split('og:description" content=')[1][1:].split('>')[0] if description[-1]=='/': description=description[:-1].strip() description=description[:-1] + #strip out self-references + description=description.replace(sourceName, 'our') a=Article(title, url, img, description, sourceName, author) return a except: - print("Article parsing error in buildArticle() for URL: "+url+" in source"+sourceName) + print("Article parsing error in buildArticle() for URL: "+url+" in source "+sourceName+'\n') return None @@ -63,7 +73,7 @@ def buildOutput(newsSourceArr): #set the random order for sources h1RandomSources=random.sample(range(len(newsSourceArr)), 4) #For h2s and h3s, select N random sources (can repeat), then - #a non-repetitive random article from within that source + #a non-repetitive random article from within h2RandomPairs=[] while len(h2RandomPairs) < 6: x=random.sample(range(len(newsSourceArr)), 1)[0] -- cgit v1.2.3