diff options
author | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-10 14:30:23 -0500 |
---|---|---|
committer | ssstvinc2 <sstvinc2@gmail.com> | 2017-03-10 14:30:23 -0500 |
commit | 449466e24998f5dedc8476e5391a0f932ebb0ec3 (patch) | |
tree | f6fb84fba16e37ece078930796656f96b9733dc1 | |
parent | 614ed3d18ac8305ff767a1db530f471765ca497c (diff) |
Made removeBadStories modular and reduced crashes by checking for None values
-rwxr-xr-x | parser.py | 91 |
1 file changed, 27 insertions, 64 deletions
@@ -114,77 +114,33 @@ def removalNotification(source, title, reason, value): print('*************************\n\n')
-def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
-
- arr=[source.h1Arr, source.h2Arr, source.h3Arr]
-
- if badTitleArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badTitleArr:
- if item in hed.title:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Title', item)
-
-
- if badDescArr!=None:
+def removeBadStoriesHelper(source, element, badStringList, arr):
+ if badStringList!=None:
for i in range(len(arr)):
for hed in arr[i]:
- for item in badDescArr:
- if item in hed.description:
+ if hed==None:
+ print("////////\nNone type found in removeBadStoriesHelper for "+source+"\n/////////")
+ break
+ for item in badStringList:
+ if item in getattr(hed, element):
arr[i].remove(hed)
#if it's in the h1 slot, bump up the
# first h2 into the h1 slot
if i==0:
arr[0].append(arr[1][0])
arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Description', item)
-
+ removalNotification(source.name, hed.title, element, item)
+
+
+def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
- if badAuthorArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badAuthorArr:
- if item in hed.author:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Author', item)
-
+ arr=[source.h1Arr, source.h2Arr, source.h3Arr]
- if badImgArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badImgArr:
- if item in hed.img:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Image', item)
-
- if badURLArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badURLArr:
- if item in hed.url:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'URL', item)
+ removeBadStoriesHelper(source, "title", badTitleArr, arr)
+ removeBadStoriesHelper(source, "description", badDescArr, arr)
+ removeBadStoriesHelper(source, "author", badAuthorArr, arr)
+ removeBadStoriesHelper(source, "img", badImgArr, arr)
+ removeBadStoriesHelper(source, "url", badURLArr, arr)
return source
@@ -345,8 +301,15 @@ def buildBlaze(): blz=buildNewsSource2(name, url, h1s, h2s, h3s)
- blz=removeBadStories(blz, ['Tucker Carlson', 'Mark Levin'], ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
+ badTitleArr=['Tucker Carlson', 'Mark Levin']
+ badDescArr=['Lawrence Jones', 'Mike Slater']
+ badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson']
+ badImgArr=None
+ badURLArr=None
+ blz=removeBadStories(blz, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
+
+
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
blz.h1Arr=blazeFixDesc(blz.h1Arr)
@@ -697,8 +660,8 @@ def buildFoxNews(): fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=['O'Reilly', 'Fox News']
- badDescArr=None
+ badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier']
+ badDescArr=['Sean Hannity']
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com']
|