-rwxr-xr-x  main.py    63
-rwxr-xr-x  parser.py  39
2 files changed, 44 insertions, 58 deletions
diff --git a/main.py b/main.py
index 182ae26..c54487e 100755
--- a/main.py
+++ b/main.py
@@ -5,6 +5,7 @@ from unbiasedFunctions import *
from parser import *
import time
+
def main():
    while True:
        print('-----------------------')
@@ -23,49 +24,27 @@ def run():
    '''
-    #for some reason, The Guardian sometimes just doesn't work right?
-    #loop until it gets it right
-    '''
-    h1='https://www.theguardian.com/us'
-    looped=False
-    while h1=='https://www.theguardian.com/us':
-        try:
-            gdn=buildGuardian()
-            h1=gdn.h1Arr[0]
-        except:
-            print('The Guardian: build error. Looping again.')
-            looped=True
-    '''
-    gdn=buildGuardian()
-    sourceList.append(gdn)
-
-    hil=buildTheHill()
-    sourceList.append(hil)
-
-    #nyt=buildNYT()
-    #sourceList.append(nyt)
-
-    npr=buildNPR()
-    sourceList.append(npr)
-    blz=buildBlaze()
-    sourceList.append(blz)
-
-    bbc=buildBBC()
-    sourceList.append(bbc)
-
-    nbc=buildNBC()
-    sourceList.append(nbc)
-
-    cbs=buildCBS()
-    sourceList.append(cbs)
-
-    #Weekly standard just doesn't update frequently enough
-    #wkl=buildWeeklyStandard()
-    #sourceList.append(wkl)
-
-    fox=buildFoxNews()
-    sourceList.append(fox)
+    ### These values have to be the second half of the function name
+    ### E.g. Guardian calls buildGuardian(), etc.
+    sourceFnArr=['Guardian', 'TheHill', 'NPR', 'Blaze', 'BBC', 'NBC', 'CBS',
+                 'FoxNews', ]
+
+    for source in sourceFnArr:
+        tries=0
+        while tries<3:
+            try:
+                fn='build'+source
+                possibles = globals().copy()
+                possibles.update(locals())
+                method = possibles.get(fn)
+                src=method()
+                sourceList.append(src)
+                break
+            except:
+                print('Build error. Looping again: '+source)
+                tries+=1
+                time.sleep(tries)
    #scrape all urls and build data structure
    newsSourceArr=buildNewsSourceArr(sourceList)
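
The rewritten run() resolves each scraper by name and retries a failed build up to three times. Below is a minimal, self-contained sketch of that look-up-by-name plus retry pattern; the two build* stubs are hypothetical stand-ins for the real scrapers in unbiasedFunctions, the globals()/locals() merge from the patch is collapsed to a plain globals() lookup, and Exception is caught explicitly where the patch uses a bare except.

    import time

    # Hypothetical stand-ins for the real build* scrapers.
    def buildGuardian():
        return 'guardian source object'

    def buildTheHill():
        return 'hill source object'

    # Each entry is the suffix of a build* function, as in the patch.
    sourceFnArr = ['Guardian', 'TheHill']

    sourceList = []
    for source in sourceFnArr:
        tries = 0
        while tries < 3:
            try:
                method = globals()['build' + source]  # resolves buildGuardian, buildTheHill, ...
                sourceList.append(method())
                break
            except Exception as err:                  # the patch catches everything with a bare except
                tries += 1
                print('Build error for ' + source + ': ' + str(err))
                time.sleep(tries)                     # wait a little longer after each failure

    print(sourceList)  # ['guardian source object', 'hill source object']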
diff --git a/parser.py b/parser.py
index 21f0669..671e2e5 100755
--- a/parser.py
+++ b/parser.py
@@ -119,7 +119,7 @@ def removeBadStoriesHelper(source, element, badStringList, arr):
    for i in range(len(arr)):
        for hed in arr[i]:
            if hed==None:
-                print("////////\nNone type found in removeBadStoriesHelper for "+source+"\n/////////")
+                print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////")
                break
            for item in badStringList:
                if item in getattr(hed, element):
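
For context, removeBadStoriesHelper walks each array of headline objects and screens one attribute at a time; the fix above prints source.name instead of the source object itself. A rough sketch of that getattr-driven filtering, using an invented Story class rather than the real news-source structures:

    class Story:
        def __init__(self, title, author):
            self.title = title
            self.author = author

    def filterStories(stories, element, badStringList):
        # Keep only stories whose named attribute contains none of the bad strings.
        kept = []
        for hed in stories:
            if hed is None:
                print('None type found while filtering on ' + element)
                continue
            # getattr lets one helper screen titles, descriptions, or authors,
            # depending on which attribute name is passed in.
            if any(bad in getattr(hed, element) for bad in badStringList):
                continue
            kept.append(hed)
        return kept

    stories = [Story('Top story', 'Jane Doe'), Story('Opinion piece', 'Glenn Beck')]
    print([s.title for s in filterStories(stories, 'author', ['Glenn Beck'])])  # ['Top story']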
@@ -197,14 +197,23 @@ def buildGuardian():
    url='http://www.theguardian.com/us'
    name='The Guardian US'
-    #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, 'utf8')
-
-    #get main headline
-    h1=content
-    h1=h1.split('<h1', 1)[1]
-    h1=h1.split('<a href="', 1)[1]
-    h1=h1.split('"', 1)[0]
+
+    while True:
+        #DOWNLOAD HOMEPAGE CONTENT
+        content=urlToContent(url, 'utf8')
+
+        #get main headline
+        h1=content
+        h1=h1.split('<h1', 1)[1]
+        h1=h1.split('<a href="', 1)[1]
+        h1=h1.split('"', 1)[0]
+
+        print(h1)
+        if h1!='https://www.theguardian.com/us':
+            break
+        else:
+            print('Guardian loop')
+
    h1s=[h1]
    #GET SECONDARY HEADLINES
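
The new buildGuardian loop re-downloads the homepage until the lead <h1> link points at an article rather than back at the homepage URL. A minimal sketch of the split-based extraction and the retry condition, with a hypothetical fetchHomepage() standing in for urlToContent(url, 'utf8'):

    def fetchHomepage():
        # Stand-in for urlToContent(); returns a tiny slice of homepage-like markup.
        return '<h1 class="top"><a href="https://www.theguardian.com/us/2018/story">Headline</a></h1>'

    while True:
        content = fetchHomepage()

        # Walk the markup with successive splits: take everything after '<h1',
        # then everything after '<a href="', then keep what precedes the closing quote.
        h1 = content.split('<h1', 1)[1]
        h1 = h1.split('<a href="', 1)[1]
        h1 = h1.split('"', 1)[0]

        # Occasionally the lead link is just the homepage itself; treat that
        # as a bad scrape and fetch again.
        if h1 != 'https://www.theguardian.com/us':
            break

    print(h1)  # https://www.theguardian.com/us/2018/story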
@@ -233,7 +242,7 @@ def buildGuardian():
            h3s.append(x)
    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
+
    gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
    gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
@@ -269,7 +278,6 @@ def buildBlaze():
    #get main headline
    h1=content
    h1=h1.split('<!-- home -->', 1)[1]
-    h1=h1.split('<!-- loop-home -->', 1)[0]
    h1=h1.split('<a class="gallery-link" href="', 1)[1]
    h1=h1.split('"', 1)[0]
    h1s=[url+h1]
@@ -279,9 +287,9 @@ def buildBlaze():
    h2s=[]
    h2=h2.split('<!-- home -->', 1)[1]
    h2=h2.split('<!-- loop-home -->', 1)[0]
-    while '</figure>\n\n<figure class="gallery-item">' in h2:
-        h2=h2.split('</figure>\n\n<figure class="gallery-item">', 1)[1]
-        h2=h2.split('href="', 1)[1]
+    while '<a class="gallery-link" href="' in h2:#'</figure>\n\n<figure class="gallery-item">' in h2:
+        h2=h2.split('<a class="gallery-link" href="', 1)[1]#'</figure>\n\n<figure class="gallery-item">', 1)[1]
+        #h2=h2.split('href="', 1)[1]
        x=h2.split('"', 1)[0]
        if h1 not in x:
            h2s.append(url+x)
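
The Blaze hunk above swaps the secondary-headline marker from the old </figure> boundary to the gallery-link anchor itself. A small sketch of that loop over invented markup (the HTML below is not real theblaze.com output):

    url = 'http://www.theblaze.com'
    h2 = ('<!-- home -->'
          '<a class="gallery-link" href="/news/first-story">A</a>'
          '<a class="gallery-link" href="/news/second-story">B</a>'
          '<!-- loop-home -->trailing markup')

    h1 = '/news/first-story'  # pretend this link was already taken as the main headline

    h2s = []
    h2 = h2.split('<!-- home -->', 1)[1]
    h2 = h2.split('<!-- loop-home -->', 1)[0]
    while '<a class="gallery-link" href="' in h2:
        h2 = h2.split('<a class="gallery-link" href="', 1)[1]
        x = h2.split('"', 1)[0]
        if h1 not in x:               # skip the link already used as the main headline
            h2s.append(url + x)

    print(h2s)  # ['http://www.theblaze.com/news/second-story']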
@@ -299,12 +307,11 @@ def buildBlaze():
    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
    blz=buildNewsSource2(name, url, h1s, h2s, h3s)
    badTitleArr=['Tucker Carlson', 'Mark Levin']
    badDescArr=['Lawrence Jones', 'Mike Slater']
-    badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson']
+    badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson', 'Glenn Beck']
    badImgArr=None
    badURLArr=None
    blz=removeBadStories(blz, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)