summary | refs | log | tree | commit | diff
path: root/parser.py
diff options
context:
space:
mode:
author sstvinc2 <sstvinc2@gmail.com> 2017-02-16 21:46:58 -0600
committer sstvinc2 <sstvinc2@gmail.com> 2017-02-16 21:46:58 -0600
commit a627f07467fd60978af7c51fb8ba3d801fcbcafe (patch)
tree 6693591beb750d24805a29430fc5b2ad3cbaf8b6 /parser.py
parent 1b08ad4652091d529588f9fb75f7412a07d2dd28 (diff)
Added in title checks for article removal
Diffstat (limited to 'parser.py')
-rw-r--r-- parser.py 126
1 files changed, 60 insertions, 66 deletions
diff --git a/parser.py b/parser.py
index 41972cd..ea318ca 100644
--- a/parser.py
+++ b/parser.py
@@ -100,73 +100,65 @@ def removeDuplicates(h1s, h2s, h3s):
-def removeBadStories(source, badDescArr, badAuthorArr, badImgArr):
+def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr):
- if badAuthorArr!=None:
- for h1 in source.h1Arr:
- for item in badAuthorArr:
- if item in h1.author:
- source.h1Arr.remove(h1)
- #if it's in the h1 slot, bump up the first h2 into the h1 slot
- source.h1Arr.append(source.h2Arr[0])
- source.h2Arr.remove(source.h2Arr[0])
- print('removed '+h1.title+' from '+source.name+' Reason: bad author')
-
- for h2 in source.h2Arr:
- for item in badAuthorArr:
- if item in h2.author:
- source.h2Arr.remove(h2)
- print('removed '+h2.title+' from '+source.name+' Reason: bad author')
-
- for h3 in source.h3Arr:
- for item in badAuthorArr:
- if item in h3.author:
- source.h3Arr.remove(h3)
- print('removed '+h3.title+' from '+source.name+' Reason: bad author')
+ arr=[source.h1Arr, source.h2Arr, source.h3Arr]
+
+ if badTitleArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badTitleArr:
+ if item in hed.title:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ print('Removed:\n'+source.name+'\n'+hed.title+' from '+source.name+'\nReason: Title ('+item+')\n')
+
if badDescArr!=None:
- for h1 in source.h1Arr:
- for item in badDescArr:
- if item in h1.description:
- source.h1Arr.remove(h1)
- #if it's in the h1 slot, bump up the first h2 into the h1 slot
- source.h1Arr.append(source.h2Arr[0])
- source.h2Arr.remove(source.h2Arr[0])
- print('removed '+h1.title+' from '+source.name+' Reason: bad description')
- for h2 in source.h2Arr:
- for item in badDescArr:
- if item in h2.description:
- source.h2Arr.remove(h2)
- print('removed '+h2.title+' from '+source.name+' Reason: bad description')
-
- for h3 in source.h3Arr:
- for item in badDescArr:
- if item in h3.description:
- source.h3Arr.remove(h3)
- print('removed '+h3.title+' from '+source.name+' Reason: bad description')
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badDescArr:
+ if item in hed.description:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ print('Removed:\n'+source.name+'\n'+hed.title+' from '+source.name+'\nReason: Description ('+item+')\n')
+
- if badImgArr!=None:
- for h1 in source.h1Arr:
- for item in badImgArr:
- if item in h1.img:
- source.h1Arr.remove(h1)
- #if it's in the h1 slot, bump up the first h2 into the h1 slot
- source.h1Arr.append(source.h2Arr[0])
- source.h2Arr.remove(source.h2Arr[0])
- print('removed '+h1.title+' from '+source.name+' Reason: bad image')
-
- for h2 in source.h2Arr:
- for item in badImgArr:
- if item in h2.img:
- source.h2Arr.remove(h2)
- print('removed '+h2.title+' from '+source.name+' Reason: bad image')
-
- for h3 in source.h3Arr:
- for item in badImgArr:
- if item in h3.img:
- source.h3Arr.remove(h3)
- print('removed '+h3.title+' from '+source.name+' Reason: bad image')
+ if badAuthorArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badAuthorArr:
+ if item in hed.author:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ print('Removed:\n'+source.name+'\n'+hed.title+' from '+source.name+'\nReason: Author ('+item+')\n')
+
+ if badImgArr!=None:
+ for i in range(len(arr)):
+ for hed in arr[i]:
+ for item in badImgArr:
+ if item in hed.img:
+ arr[i].remove(hed)
+ #if it's in the h1 slot, bump up the
+ # first h2 into the h1 slot
+ if i==0:
+ arr[0].append(arr[1][0])
+ arr[1].remove(arr[1][0])
+ print('Removed:\n'+source.name+'\n'+hed.title+' from '+source.name+'\nReason: Image ('+item+')\n')
+
return source
@@ -210,7 +202,7 @@ def buildGuardian():
h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
- gdn=removeBadStories(gdn, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
+ gdn=removeBadStories(gdn, None, ['Tom McCarthy'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
return gdn
@@ -276,7 +268,7 @@ def buildBlaze():
blz=buildNewsSource2(name, url, h1s, h2s, h3s)
- blz=removeBadStories(blz, None, ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka'], None)
+ blz=removeBadStories(blz, None, None, ['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka'], None)
#The Blaze has dumb, short description fields, so we need to grab
#the first x characters of actual article text instead
@@ -513,11 +505,12 @@ def buildWeeklyStandard():
wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
+ badTitleArr=None
## if flagged again, remove Micah Mattix
badDescArr=['Matt Labash']
badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN']
badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png']
- wkl=removeBadStories(wkl, badDescArr, badAuthorArr, badImgArr)
+ wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr)
return wkl
@@ -563,10 +556,11 @@ def buildFoxNews():
fox=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
+ badTitleArr=['O&#039;Reilly']
badDescArr=None
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
- fox=removeBadStories(fox, badDescArr, badAuthorArr, badImgArr)
+ fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr)
return fox