diff options
author | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-06 18:12:59 -0500 |
---|---|---|
committer | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-06 18:12:59 -0500 |
commit | f8c6b0084e7d0928121d4c05d3b1f47b10c303c7 (patch) | |
tree | b82e1fc2034ab454a9e3bb38cb08d0a80dbeec17 | |
parent | 0ce27f6e13a139c2fe06082dfb10a35d213fc7a7 (diff) |
Added spotCheck ability. Other minor tweaks
-rwxr-xr-x | main.py | 7 | ||||
-rwxr-xr-x | parser.py | 24 | ||||
-rwxr-xr-x | spotCheck.py | 38 | ||||
-rw-r--r-- | unbiasedFunctions.py | 4 |
4 files changed, 62 insertions, 11 deletions
@@ -26,14 +26,15 @@ def run(): hil=buildTheHill() sourceList.append(hil) - nyt=buildNYT() - sourceList.append(nyt) + #nyt=buildNYT() + #sourceList.append(nyt) npr=buildNPR() sourceList.append(npr) #for some reason, The Guardian sometimes just doesn't work right? #loop until it gets it right + ''' h1='https://www.theguardian.com/us' looped=False while h1=='https://www.theguardian.com/us': @@ -43,6 +44,8 @@ def run(): except: print('The Guardian: build error. Looping again.') looped=True + ''' + gdn=buildGuardian() sourceList.append(gdn) blz=buildBlaze() @@ -226,7 +226,7 @@ def buildTheHill(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
hil=buildNewsSource2(name, url, h1s, h2s, h3s)
- hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp'], None, None)
+ hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams'], None, None)
return hil
@@ -235,7 +235,7 @@ def buildTheHill(): def buildGuardian():
- url='http://www.theguardian.com/us'
+ url='http://www.theguardian.com/us-news'
name='The Guardian'
#DOWNLOAD HOMEPAGE CONTENT
@@ -253,7 +253,7 @@ def buildGuardian(): h2s=[]
#only the h1 and the two h2s have this, so split on it and grab
#the second two
- h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">', 3)[2:]
+ h2=h2.split('<div class="fc-item__image-container u-responsive-ratio inlined-image">')[2:]
for x in h2:
x=x.split('<h2 class="fc-item__title"><a href="', 1)[1]
x=x.split('"', 1)[0]
@@ -273,7 +273,7 @@ def buildGuardian(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
- gdn=removeBadStories(gdn, None, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
+ gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
@@ -339,7 +339,7 @@ def buildBlaze(): blz=buildNewsSource2(name, url, h1s, h2s, h3s)
- blz=removeBadStories(blz, None, ['Lawrence Jones'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
+ blz=removeBadStories(blz, None, ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
@@ -401,7 +401,7 @@ def buildCBS(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
cbs=buildNewsSource2(name, url, h1s, h2s, h3s)
- cbs=removeBadStories(cbs, ['60 Minutes'], None, None, None, None)
+ cbs=removeBadStories(cbs, ['60 Minutes'], None, None, None, ['whats-in-the-news-coverart'])
return cbs
@@ -455,10 +455,12 @@ def buildNBC(): h3s.append(x)
#adjust for today.com urls
+ '''
for arr in [h1s, h2s, h3s]:
for i in range(len(arr)):
if 'today.com' in arr[i]:
arr[i]=arr[i].split('.com', 1)[1]
+ '''
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
nbc=buildNewsSource2(name, url, h1s, h2s, h3s)
@@ -510,7 +512,13 @@ def buildBBC(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
bbc=buildNewsSource2(name, url, h1s, h2s, h3s)
+ badTitleArr=None
+ badDescArr=None
+ badAuthorArr=None
+ badImgArr=['bbc_news_logo.png']
+ bbc=removeBadStories(bbc, badTitleArr, badDescArr, badAuthorArr, badImgArr)
+
#REMOVE ' - BBC News' from headlines
for i in range(len(bbc.h1Arr)):
if ' - BBC News' in bbc.h1Arr[i].title:
@@ -633,7 +641,7 @@ def buildNPR(): npr=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=None
+ badTitleArr=['The Two-Way']
badDescArr=None
badAuthorArr=None
badImgArr=None
@@ -683,7 +691,7 @@ def buildFoxNews(): fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=['O\'Reilly']
+ badTitleArr=['O\'Reilly', 'Fox News']
badDescArr=None
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
diff --git a/spotCheck.py b/spotCheck.py new file mode 100755 index 0000000..5c0e54d --- /dev/null +++ b/spotCheck.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + + +from parser import * +from unbiasedObjects import * +import sys + +def spotCheck(src): + + fns = {'hil' : buildTheHill, + 'cbs' : buildCBS, + 'npr' : buildNPR, + 'fox' : buildFoxNews, + 'gdn' : buildGuardian, + 'blz' : buildBlaze, + 'bbc' : buildBBC, + 'nbc' : buildNBC} + + data=fns[src]() + + print('H1s:\n--------------') + for h in data.h1Arr: + print(h.title) + + print('\n\nH2s:\n--------------') + for h in data.h2Arr: + print(h.title) + + print('\n\nH3s:\n--------------') + for h in data.h3Arr: + print(h.title) + + print('\n\n') + + + +if __name__=='__main__': + spotCheck(sys.argv[1]) diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py index 444428f..4d2019c 100644 --- a/unbiasedFunctions.py +++ b/unbiasedFunctions.py @@ -96,6 +96,8 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im print("SHOULDN'T GET HERE")
#strip out self-references
+ description=description.replace(sourceName+"'s", 'our')
+ description=description.replace(sourceName+"'", 'our')
description=description.replace(sourceName, 'our')
if debugging:
@@ -141,7 +143,7 @@ def buildOutput(newsSourceArr): if not pair in h3RandomPairs:
h3RandomPairs.append(pair)
else:
- continue
+ print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n')
#replace html template locations with data from newsSourceArr
for i in range(len(h1RandomSources)):
|