summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author sstvinc2 <sstvinc2@gmail.com> 2017-02-15 09:24:26 -0600
committer sstvinc2 <sstvinc2@gmail.com> 2017-02-15 09:24:26 -0600
commit 064e200fa57dfeda3776b7a708cc0af8280f20fb (patch)
tree e52415d4e0f329c0283709c9e44e2ea93829721e
parent 511b3ba3f9de0d38e861833d6bcd7160487af111 (diff)
Added author name to Article class; added removeBadStories() filter function
-rw-r--r-- parser.py 60
-rw-r--r-- unbiasedFunctions.py 12
-rw-r--r-- unbiasedObjects.py 6
3 files changed, 67 insertions, 11 deletions
diff --git a/parser.py b/parser.py
index 93ed020..0bd5b0f 100644
--- a/parser.py
+++ b/parser.py
@@ -92,6 +92,56 @@ def removeDuplicates(h1s, h2s, h3s):
return h1s, h2s, h3s
+
+
+
+def removeBadStories(source, badDescArr, badAuthorArr):
+
+ if badAuthorArr!=None:
+ for h1 in source.h1Arr:
+ for item in badAuthorArr:
+ if item in h1.author:
+ source.h1Arr.remove(h1)
+ #if it's in the h1 slot, bump up the first h2 into the h1 slot
+ source.h1Arr.append(source.h2Arr[0])
+ source.h2Arr.remove(source.h2Arr[0])
+ print('removed '+h1.title+' from '+source.name)
+ for h2 in source.h2Arr:
+ for item in badAuthorArr:
+ if item in h2.author:
+ source.h2Arr.remove(h2)
+ print('removed '+h2.title+' from '+source.name)
+
+ for h3 in source.h3Arr:
+ for item in badAuthorArr:
+ if item in h3.author:
+ source.h3Arr.remove(h3)
+ print('removed '+h3.title+' from '+source.name)
+
+ '''
+ if badDescArr!=None:
+ for h1 in source.h1Arr:
+ for item in badDescArr:
+ if item in h1.description:
+ source.h1Arr.remove(h1)
+ #if it's in the h1 slot, bump up the first h2 into the h1 slot
+ source.h1Arr.append(source.h2Arr[0])
+ source.h2Arr.remove(source.h2Arr[0])
+ print('removed '+h1.title+' from '+source.name)
+ for h2 in source.h2Arr:
+ for item in badDescArr:
+ if item in h2.description:
+ source.h2Arr.remove(h2)
+ print('removed '+h2.title+' from '+source.name)
+
+ for h3 in source.h3Arr:
+ for item in badDescArr:
+ if item in h3.description:
+ source.h3Arr.remove(h3)
+ print('removed '+h3.title+' from '+source.name)
+ '''
+
+ return source
@@ -148,13 +198,9 @@ def buildWeeklyStandard():
wkl=buildNewsSource2(name, url, h1s, h2s, h3s)
#REMOVE BAD STORIES
- #if it's in the h1 slot, bump up the first h2 into the h1 slot
- for h1 in wkl.h1Arr:
- if 'Matt Labash' in h1.description:
- wkl.h1Arr.remove(h1)
- wkl.h1Arr.append(wkl.h2Arr[0])
- wkl.h2Arr.remove(wkl.h2Arr[0])
- print('removed '+h1.title)
+ badDescArr=['Matt Labash']
+ badAuthorArr=['MATT LABASH']
+ wkl=removeBadStories(wkl, badDescArr, badAuthorArr)
return wkl
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index ef6ae7c..5f46ed2 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -33,16 +33,24 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
title=title[:-1].strip()
title=title[:-1]
+ author=''
+ authorTags=['article:author', 'dc.creator']
+ for tag in authorTags:
+ if tag in content:
+ author=content.split(tag+'" content=')[1][1:].split('>')[0]
+ author=author[:-1]
+ break
+
description=content.split('og:description" content=')[1][1:].split('>')[0]
if description[-1]=='/':
description=description[:-1].strip()
description=description[:-1]
- a=Article(title, url, img, description, sourceName)
+ a=Article(title, url, img, description, sourceName, author)
return a
except:
- print("Article parsing error in buildArticle() for URL: "+url)
+ print("Article parsing error in buildArticle() for URL: "+url+" in source"+sourceName)
return None
diff --git a/unbiasedObjects.py b/unbiasedObjects.py
index 2233b0c..3affbe6 100644
--- a/unbiasedObjects.py
+++ b/unbiasedObjects.py
@@ -4,16 +4,18 @@ class Article():
img=''
description=''
source=''
+ author=''
- def __init__(self, title, url, img, description, source):
+ def __init__(self, title, url, img, description, source, author):
self.title=title
self.url=url
self.img=img
self.description=description
self.source=source
+ self.author=author
def __str__(self):
- return '-----------\n'+self.title+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------'
+ return '-----------\n'+self.title+'\n'+self.author+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------'
class NewsSource2():