From ef0dc339f42c6befd07f0d626c1eaed8ad7ee057 Mon Sep 17 00:00:00 2001 From: ssstvinc2 Date: Fri, 24 Mar 2017 19:05:54 -0400 Subject: Added ABC News, some parser fixes as well --- main.py | 4 +- parser.py | 124 +++++++++++++++++++++++++++++++++++++++++++++++++-- spotCheck.py | 4 +- unbiasedFunctions.py | 5 ++- 4 files changed, 128 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 735ff6b..a109d2f 100755 --- a/main.py +++ b/main.py @@ -21,9 +21,7 @@ def run(): SOURCES TO ADD NEXT: -ABC -REUTERS - -Christian Science Monitor -Town Hall - -Washington Times ''' @@ -31,7 +29,7 @@ def run(): ### These values have to be the second half of the function name ### E.g. Guardian calls buildGuardian(), etc. sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', - 'FoxNews', 'WashTimes'] #'Blaze' + 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' for source in sourceFnArr: tries=0 diff --git a/parser.py b/parser.py index 942612a..cf56d13 100755 --- a/parser.py +++ b/parser.py @@ -32,19 +32,28 @@ Returns a newsSource2 object ''' def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): h1Arr=[] - h1Arr.append(buildArticle(h1URLs[0], name)) + a=buildArticle(h1URLs[0], name) + if a==None: + print('................\nH1 Nonetype in '+name+'\n................') + else: + h1Arr.append(a) h2Arr=[] for x in h2URLs: a=buildArticle(x, name) if a!=None: h2Arr.append(a) + else: + print('................\nH2 Nonetype in '+name+'\n................') + h3Arr=[] for x in h3URLs: a=buildArticle(x, name) if a!=None: h3Arr.append(a) + else: + print('................\nH3 Nonetype in '+name+'\n................') #BUILD THE NEWS SOURCE newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) @@ -268,8 +277,6 @@ def buildWashTimes(): #GET SECONDARY HEADLINES h2=content h2s=[] - #only the h1 and the two h2s have this, so split on it and grab - #the second two h2=h2.split('class="top-news', 1)[1] h2=h2.split('', 1)[1] #end of top-news article h2=h2.split('
')[1:] + + for x in h2: + x=x.split('')[0] + if 'og:image' in content: + img=content.split('og:image" content=')[1][1:].split('>')[0] + elif sourceName=='ABC News': + img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' if img[-1]=='/': #because the quote separator could be ' or ", #trim to just before it then lop it off -- cgit v1.2.3