-rwxr-xr-x  main.py              |   4
-rwxr-xr-x  parser.py            | 124
-rwxr-xr-x  spotCheck.py         |   4
-rw-r--r--  unbiasedFunctions.py |   5
4 files changed, 128 insertions, 9 deletions
diff --git a/main.py b/main.py
@@ -21,9 +21,7 @@ def run():
SOURCES TO ADD NEXT:
-ABC
-REUTERS
- -Christian Science Monitor
-Town Hall
- -Washington Times
'''
@@ -31,7 +29,7 @@ def run():
### These values have to be the second half of the function name
### E.g. Guardian calls buildGuardian(), etc.
sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS',
- 'FoxNews', 'WashTimes'] #'Blaze'
+ 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze'

for source in sourceFnArr:
tries=0
diff --git a/parser.py b/parser.py
@@ -32,19 +32,28 @@ Returns a newsSource2 object
'''
def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
h1Arr=[]
- h1Arr.append(buildArticle(h1URLs[0], name))
+ a=buildArticle(h1URLs[0], name)
+ if a==None:
+ print('................\nH1 Nonetype in '+name+'\n................')
+ else:
+ h1Arr.append(a)
h2Arr=[]
for x in h2URLs:
a=buildArticle(x, name)
if a!=None:
h2Arr.append(a)
+ else:
+ print('................\nH2 Nonetype in '+name+'\n................')
+
h3Arr=[]
for x in h3URLs:
a=buildArticle(x, name)
if a!=None:
h3Arr.append(a)
+ else:
+ print('................\nH3 Nonetype in '+name+'\n................')
#BUILD THE NEWS SOURCE
newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
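The hunk above applies the same guard to all three headline tiers: skip an article that failed to parse instead of appending None. A minimal self-contained sketch of that pattern (the helper, the stub builder, and the tier label are illustrative, not from this commit):

def appendIfParsed(arr, builder, url, name, tier):
    #append only articles that actually parsed; log the failures
    a=builder(url, name)
    if a is None:
        print('................\n'+tier+' Nonetype in '+name+'\n................')
    else:
        arr.append(a)

#stub builder standing in for buildArticle; it fails for one URL
stub=lambda url, name: None if 'bad' in url else {'url': url, 'source': name}
h2Arr=[]
for u in ['http://example.com/ok', 'http://example.com/bad']:
    appendIfParsed(h2Arr, stub, u, 'Example Source', 'H2')
print(len(h2Arr))  #prints 1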
@@ -268,8 +277,6 @@ def buildWashTimes():
#GET SECONDARY HEADLINES
h2=content
h2s=[]
- #only the h1 and the two h2s have this, so split on it and grab
- #the second two
h2=h2.split('class="top-news', 1)[1]
h2=h2.split('</article>', 1)[1] #end of top-news article
h2=h2.split('<article ', 1)[0] #note the space; we want unclassed articles
@@ -299,6 +306,60 @@ def buildWashTimes():
return wat
+def buildCSM():
+ url='http://www.csmonitor.com'
+ name='Christian Science Monitor'
+
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('ui-top-center', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+
+ h1s=[url+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('block-3-1', 1)[1]
+ h2=h2.split('ui-top-right', 1)[0]
+ h2=h2.split('<h3 class="story_headline">')[1:]
+
+ for x in h2:
+ x=x.split('<a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h2s.append(url+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('block-2-1', 1)[1]
+ h3=h3.split('block-2-2', 1)[0]
+ h3=h3.split('<h3 class="story_headline')[1:]
+
+ for x in h3:
+ x=x.split('<a href="', 2)[-1]
+ x=x.split('"', 1)[0]
+ h3s.append(url+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+
+ csm=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ badTitleArr=['Change Agent']
+ badDescArr=None
+ badAuthorArr=None
+ badImgArr=['csm_logo']
+ badURLArr=['difference-maker']
+ csm=removeBadStories(csm, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
+
+ return csm
+
+
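buildCSM, like the other builders, works by slicing the raw homepage HTML on landmark strings and reading the next href. A tiny self-contained illustration of that technique (the function name and the sample markup are made up for this sketch, not taken from csmonitor.com):

def firstHrefAfter(html, marker):
    #cut everything before the landmark, then take the next <a href="..."> value
    section=html.split(marker, 1)[1]
    return section.split('<a href="', 1)[1].split('"', 1)[0]

sample='<div class="ui-top-center"><a href="/USA/2017/0216/example-story">Headline</a></div>'
print(firstHrefAfter(sample, 'ui-top-center'))  #prints /USA/2017/0216/example-story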
'''
Function to fix the oddly short og:descriptions provided
@@ -679,6 +740,61 @@ def buildNPR():
+
+def buildABC():
+ url='http://www.abcnews.go.com'
+ name='ABC News'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('id="row-1"', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('id="row-2"', 1)[1]
+ h2=h2.split('id="row-3"', 1)[0]
+ h2=h2.split('card single row-item')[1:3] #should just be 2 of these
+ for x in h2:
+ x=x.split('<a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ if h1 not in x:
+ h2s.append(x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('id="row-1"', 1)[1]
+ h3=h3.split('tab-data active', 1)[1]
+ h3=h3.split('tab-data"', 1)[0] #note the trailing quotation
+ while '<a href="' in h3:
+ h3=h3.split('<a href="', 1)[1]
+ x=h3.split('"', 1)[0]
+ if h1 not in x:
+ h3s.append(x)
+
+ h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+ abc=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+ #REMOVE BAD STORIES
+ badTitleArr=None
+ badDescArr=None
+ badAuthorArr=None
+ badImgArr=None
+ badURLArr=None
+ abc=removeBadStories(abc, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
+
+ return abc
+
+
+
+
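With buildCSM and buildABC defined, the new spotCheck.py keys further down wire them up for manual testing. A hedged usage sketch, assuming spotCheck.py can be imported as a module without triggering its command-line entry point:

from spotCheck import spotCheck

spotCheck('csm')   #runs buildCSM()
spotCheck('abc')   #runs buildABC()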
def buildFoxNews():
url='http://foxnews.com'
name='Fox News'
@@ -718,7 +834,7 @@ def buildFoxNews():
fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=['O\'Reilly', 'Fox News', 'Brett Baier']
+ badTitleArr=['O\'Reilly', 'Fox News', 'Brett Baier', 'Tucker']
badDescArr=['Sean Hannity']
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
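This hunk only adds 'Tucker' to the Fox News title blocklist. removeBadStories itself is not part of this diff; an illustrative substring filter in the same spirit, just to show what a title blocklist does (function and sample data are hypothetical):

def filterByTitle(stories, badTitleArr):
    #drop any story whose title contains one of the flagged substrings
    return [s for s in stories if not any(bad in s['title'] for bad in badTitleArr)]

stories=[{'title': 'Tucker Carlson Tonight recap'}, {'title': 'Senate passes budget bill'}]
print(filterByTitle(stories, ['Tucker', 'Fox News']))  #keeps only the budget story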
diff --git a/spotCheck.py b/spotCheck.py
index 7bf46bb..d1edda4 100755
--- a/spotCheck.py
+++ b/spotCheck.py
@@ -15,7 +15,9 @@ def spotCheck(src):
'blz' : buildBlaze,
'bbc' : buildBBC,
'nbc' : buildNBC,
- 'wat' : buildWashTimes}
+ 'wat' : buildWashTimes,
+ 'csm' : buildCSM,
+ 'abc' : buildABC}

data=fns[src]()

diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index fca2f2d..1a80d7a 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -41,7 +41,10 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd,
img=img.split('src="', 1)[1].split('"')[0]
else:
- img=content.split('og:image" content=')[1][1:].split('>')[0]
+ if 'og:image' in content:
+ img=content.split('og:image" content=')[1][1:].split('>')[0]
+ elif sourceName=='ABC News':
+ img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX'
if img[-1]=='/':
#because the quote separator could be ' or ",
#trim to just before it then lop it off
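The unbiasedFunctions.py change guards the og:image lookup so a page without that meta tag no longer crashes buildArticle, and hard-codes a placeholder image for ABC News. A minimal sketch of that fallback order (the helper name and fallback table are illustrative; the real code keeps this logic inline):

def pickImage(content, sourceName, fallbacks):
    #prefer the page's og:image tag; otherwise use a per-source placeholder
    if 'og:image' in content:
        return content.split('og:image" content=')[1][1:].split('>')[0]
    return fallbacks.get(sourceName, '')

html='<meta property="og:image" content="http://example.com/teaser.jpg">'
print(pickImage(html, 'ABC News', {'ABC News': 'http://example.com/placeholder.jpgX'}))
#prints the URL with a stray trailing quote character, which the
#quote-trimming code that follows in buildArticle then lops off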