summary | refs | log | tree | commit | diff
path: root/parser.py
diff options
context:
space:
mode:
author: ssstvinc2 <sstvinc2@gmail.com> 2017-03-10 14:30:23 -0500
committer: ssstvinc2 <sstvinc2@gmail.com> 2017-03-10 14:30:23 -0500
commit: 449466e24998f5dedc8476e5391a0f932ebb0ec3 (patch)
tree: f6fb84fba16e37ece078930796656f96b9733dc1 /parser.py
parent: 614ed3d18ac8305ff767a1db530f471765ca497c (diff)
Made removeBadStories modular and reduced crashes by checking for None values
Diffstat (limited to 'parser.py')
-rwxr-xr-x parser.py 91
1 files changed, 27 insertions, 64 deletions
diff --git a/parser.py b/parser.py
index a54f033..21f0669 100755
--- a/parser.py
+++ b/parser.py
@@ -114,77 +114,33 @@ def removalNotification(source, title, reason, value):
print('*************************\n\n')
-def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
-
- arr=[source.h1Arr, source.h2Arr, source.h3Arr]
-
- if badTitleArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badTitleArr:
- if item in hed.title:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Title', item)
-
-
- if badDescArr!=None:
+def removeBadStoriesHelper(source, element, badStringList, arr):
+ if badStringList!=None:
for i in range(len(arr)):
for hed in arr[i]:
- for item in badDescArr:
- if item in hed.description:
+ if hed==None:
+ print("////////\nNone type found in removeBadStoriesHelper for "+source+"\n/////////")
+ break
+ for item in badStringList:
+ if item in getattr(hed, element):
arr[i].remove(hed)
#if it's in the h1 slot, bump up the
# first h2 into the h1 slot
if i==0:
arr[0].append(arr[1][0])
arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Description', item)
-
+ removalNotification(source.name, hed.title, element, item)
+
+
+def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None):
- if badAuthorArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badAuthorArr:
- if item in hed.author:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Author', item)
-
+ arr=[source.h1Arr, source.h2Arr, source.h3Arr]
- if badImgArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badImgArr:
- if item in hed.img:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'Image', item)
-
- if badURLArr!=None:
- for i in range(len(arr)):
- for hed in arr[i]:
- for item in badURLArr:
- if item in hed.url:
- arr[i].remove(hed)
- #if it's in the h1 slot, bump up the
- # first h2 into the h1 slot
- if i==0:
- arr[0].append(arr[1][0])
- arr[1].remove(arr[1][0])
- removalNotification(source.name, hed.title, 'URL', item)
+ removeBadStoriesHelper(source, "title", badTitleArr, arr)
+ removeBadStoriesHelper(source, "description", badDescArr, arr)
+ removeBadStoriesHelper(source, "author", badAuthorArr, arr)
+ removeBadStoriesHelper(source, "img", badImgArr, arr)
+ removeBadStoriesHelper(source, "url", badURLArr, arr)
return source
@@ -345,8 +301,15 @@ def buildBlaze():
blz=buildNewsSource2(name, url, h1s, h2s, h3s)
- blz=removeBadStories(blz, ['Tucker Carlson', 'Mark Levin'], ['Lawrence Jones', 'Mike Slater'], ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales'], None)
+ badTitleArr=['Tucker Carlson', 'Mark Levin']
+ badDescArr=['Lawrence Jones', 'Mike Slater']
+ badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson']
+ badImgArr=None
+ badURLArr=None
+ blz=removeBadStories(blz, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
+
+
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
blz.h1Arr=blazeFixDesc(blz.h1Arr)
@@ -697,8 +660,8 @@ def buildFoxNews():
fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- badTitleArr=['O&#039;Reilly', 'Fox News']
- badDescArr=None
+ badTitleArr=['O&#039;Reilly', 'Fox News', 'Brett Baier']
+ badDescArr=['Sean Hannity']
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com']