From 064e200fa57dfeda3776b7a708cc0af8280f20fb Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Wed, 15 Feb 2017 09:24:26 -0600 Subject: Added author name to Article class; added removeBadArticles() filter function --- parser.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++------ unbiasedFunctions.py | 12 +++++++++-- unbiasedObjects.py | 6 ++++-- 3 files changed, 67 insertions(+), 11 deletions(-) diff --git a/parser.py b/parser.py index 93ed020..0bd5b0f 100644 --- a/parser.py +++ b/parser.py @@ -92,6 +92,56 @@ def removeDuplicates(h1s, h2s, h3s): return h1s, h2s, h3s + + + +def removeBadStories(source, badDescArr, badAuthorArr): + + if badAuthorArr!=None: + for h1 in source.h1Arr: + for item in badAuthorArr: + if item in h1.author: + source.h1Arr.remove(h1) + #if it's in the h1 slot, bump up the first h2 into the h1 slot + source.h1Arr.append(source.h2Arr[0]) + source.h2Arr.remove(source.h2Arr[0]) + print('removed '+h1.title+' from '+source.name) + for h2 in source.h2Arr: + for item in badAuthorArr: + if item in h2.author: + source.h2Arr.remove(h2) + print('removed '+h2.title+' from '+source.name) + + for h3 in source.h3Arr: + for item in badAuthorArr: + if item in h3.author: + source.h3Arr.remove(h3) + print('removed '+h3.title+' from '+source.name) + + ''' + if badDescArr!=None: + for h1 in source.h1Arr: + for item in badDescArr: + if item in h1.description: + source.h1Arr.remove(h1) + #if it's in the h1 slot, bump up the first h2 into the h1 slot + source.h1Arr.append(source.h2Arr[0]) + source.h2Arr.remove(source.h2Arr[0]) + print('removed '+h1.title+' from '+source.name) + for h2 in source.h2Arr: + for item in badDescArr: + if item in h2.description: + source.h2Arr.remove(h2) + print('removed '+h2.title+' from '+source.name) + + for h3 in source.h3Arr: + for item in badDescArr: + if item in h3.description: + source.h3Arr.remove(h3) + print('removed '+h3.title+' from '+source.name) + ''' + + return source @@ -148,13 +198,9 @@ def buildWeeklyStandard(): wkl=buildNewsSource2(name, url, h1s, h2s, h3s) #REMOVE BAD STORIES - #if it's in the h1 slot, bump up the first h2 into the h1 slot - for h1 in wkl.h1Arr: - if 'Matt Labash' in h1.description: - wkl.h1Arr.remove(h1) - wkl.h1Arr.append(wkl.h2Arr[0]) - wkl.h2Arr.remove(wkl.h2Arr[0]) - print('removed '+h1.title) + badDescArr=['Matt Labash'] + badAuthorArr=['MATT LABASH'] + wkl=removeBadStories(wkl, badDescArr, badAuthorArr) return wkl diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py index ef6ae7c..5f46ed2 100644 --- a/unbiasedFunctions.py +++ b/unbiasedFunctions.py @@ -33,16 +33,24 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im title=title[:-1].strip() title=title[:-1] + author='' + authorTags=['article:author', 'dc.creator'] + for tag in authorTags: + if tag in content: + author=content.split(tag+'" content=')[1][1:].split('>')[0] + author=author[:-1] + break + description=content.split('og:description" content=')[1][1:].split('>')[0] if description[-1]=='/': description=description[:-1].strip() description=description[:-1] - a=Article(title, url, img, description, sourceName) + a=Article(title, url, img, description, sourceName, author) return a except: - print("Article parsing error in buildArticle() for URL: "+url) + print("Article parsing error in buildArticle() for URL: "+url+" in source"+sourceName) return None diff --git a/unbiasedObjects.py b/unbiasedObjects.py index 2233b0c..3affbe6 100644 --- a/unbiasedObjects.py +++ b/unbiasedObjects.py @@ -4,16 +4,18 @@ class Article(): img='' description='' source='' + author='' - def __init__(self, title, url, img, description, source): + def __init__(self, title, url, img, description, source, author): self.title=title self.url=url self.img=img self.description=description self.source=source + self.author=author def __str__(self): - return '-----------\n'+self.title+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------' + return '-----------\n'+self.title+'\n'+self.author+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------' class NewsSource2(): -- cgit v1.2.3