diff options
-rwxr-xr-x | main.py | 63 | ||||
-rwxr-xr-x | parser.py | 39 |
2 files changed, 44 insertions, 58 deletions
@@ -5,6 +5,7 @@ from unbiasedFunctions import * from parser import * import time + def main(): while True: print('-----------------------') @@ -23,49 +24,27 @@ def run(): ''' - #for some reason, The Guardian sometimes just doesn't work right? - #loop until it gets it right - ''' - h1='https://www.theguardian.com/us' - looped=False - while h1=='https://www.theguardian.com/us': - try: - gdn=buildGuardian() - h1=gdn.h1Arr[0] - except: - print('The Guardian: build error. Looping again.') - looped=True - ''' - gdn=buildGuardian() - sourceList.append(gdn) - - hil=buildTheHill() - sourceList.append(hil) - - #nyt=buildNYT() - #sourceList.append(nyt) - - npr=buildNPR() - sourceList.append(npr) - blz=buildBlaze() - sourceList.append(blz) - - bbc=buildBBC() - sourceList.append(bbc) - - nbc=buildNBC() - sourceList.append(nbc) - - cbs=buildCBS() - sourceList.append(cbs) - - #Weekly standard just doesn't update frequently enough - #wkl=buildWeeklyStandard() - #sourceList.append(wkl) - - fox=buildFoxNews() - sourceList.append(fox) + ### These values have to be the second half of the function name + ### E.g. Guardian calls buildGuardian(), etc. + sourceFnArr=['Guardian', 'TheHill', 'NPR', 'Blaze', 'BBC', 'NBC', 'CBS', + 'FoxNews', ] + + for source in sourceFnArr: + tries=0 + while tries<3: + try: + fn='build'+source + possibles = globals().copy() + possibles.update(locals()) + method = possibles.get(fn) + src=method() + sourceList.append(src) + break + except: + print('Build error. Looping again: '+source) + tries+=1 + time.sleep(tries) #scrape all urls and build data structure newsSourceArr=buildNewsSourceArr(sourceList) @@ -119,7 +119,7 @@ def removeBadStoriesHelper(source, element, badStringList, arr): for i in range(len(arr)):
for hed in arr[i]:
if hed==None:
- print("////////\nNone type found in removeBadStoriesHelper for "+source+"\n/////////")
+ print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////")
break
for item in badStringList:
if item in getattr(hed, element):
@@ -197,14 +197,23 @@ def buildGuardian(): url='http://www.theguardian.com/us'
name='The Guardian US'
- #DOWNLOAD HOMEPAGE CONTENT
- content=urlToContent(url, 'utf8')
-
- #get main headline
- h1=content
- h1=h1.split('<h1', 1)[1]
- h1=h1.split('<a href="', 1)[1]
- h1=h1.split('"', 1)[0]
+
+ while True:
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url, 'utf8')
+
+ #get main headline
+ h1=content
+ h1=h1.split('<h1', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+
+ print(h1)
+ if h1!='https://www.theguardian.com/us':
+ break
+ else:
+ print('Guardian loop')
+
h1s=[h1]
#GET SECONDARY HEADLINES
@@ -233,7 +242,7 @@ def buildGuardian(): h3s.append(x)
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
+
gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
@@ -269,7 +278,6 @@ def buildBlaze(): #get main headline
h1=content
h1=h1.split('<!-- home -->', 1)[1]
- h1=h1.split('<!-- loop-home -->', 1)[0]
h1=h1.split('<a class="gallery-link" href="', 1)[1]
h1=h1.split('"', 1)[0]
h1s=[url+h1]
@@ -279,9 +287,9 @@ def buildBlaze(): h2s=[]
h2=h2.split('<!-- home -->', 1)[1]
h2=h2.split('<!-- loop-home -->', 1)[0]
- while '</figure>\n\n<figure class="gallery-item">' in h2:
- h2=h2.split('</figure>\n\n<figure class="gallery-item">', 1)[1]
- h2=h2.split('href="', 1)[1]
+ while '<a class="gallery-link" href="' in h2:#'</figure>\n\n<figure class="gallery-item">' in h2:
+ h2=h2.split('<a class="gallery-link" href="', 1)[1]#'</figure>\n\n<figure class="gallery-item">', 1)[1]
+ #h2=h2.split('href="', 1)[1]
x=h2.split('"', 1)[0]
if h1 not in x:
h2s.append(url+x)
@@ -299,12 +307,11 @@ def buildBlaze(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
blz=buildNewsSource2(name, url, h1s, h2s, h3s)
badTitleArr=['Tucker Carlson', 'Mark Levin']
badDescArr=['Lawrence Jones', 'Mike Slater']
- badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson']
+ badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson', 'Glenn Beck']
badImgArr=None
badURLArr=None
blz=removeBadStories(blz, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
|