From d1c7dfc9c2a47edf80527c2457481b9508087ce6 Mon Sep 17 00:00:00 2001 From: sstvinc2 Date: Sat, 18 Feb 2017 22:23:51 -0600 Subject: Added The Hill; also tweaked buildArticle() --- main.py | 9 +++++--- parser.py | 49 ++++++++++++++++++++++++++++++++++++++++- unbiasedFunctions.py | 61 +++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 98 insertions(+), 21 deletions(-) diff --git a/main.py b/main.py index 8318da0..f381db5 100644 --- a/main.py +++ b/main.py @@ -20,10 +20,12 @@ def run(): SOURCES TO ADD NEXT: -ABC -REUTERS - -THE HILL ''' + hil=buildTheHill() + sourceList.append(hil) + nyt=buildNYT() sourceList.append(nyt) @@ -57,8 +59,9 @@ def run(): cbs=buildCBS() sourceList.append(cbs) - wkl=buildWeeklyStandard() - sourceList.append(wkl) + #Weekly standard just doesn't update frequently enough + #wkl=buildWeeklyStandard() + #sourceList.append(wkl) fox=buildFoxNews() sourceList.append(fox) diff --git a/parser.py b/parser.py index 5cb1c51..6b7b0a6 100644 --- a/parser.py +++ b/parser.py @@ -176,6 +176,53 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b + +def buildTheHill(): + url='http://thehill.com' + name='The Hill' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('
', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
', 1)[1] + h3=h3.split('', 1)[0] + while '
')[0] if title[-1]=='/': title=title[:-1].strip() title=title[:-1] + if debugging: + print(title) + + author='' - if sourceName!='The Blaze': - authorTags=['article:author', 'dc.creator'] + if sourceName=='The Blaze': + if 'class="article-author">' in content: + author=content.split('class="article-author">')[1].split('<')[0] + elif 'class="article-author" href="' in content: + author=content.split('class="article-author" href="')[1] + author=author.split('>')[1].split('<')[0].strip() + else: + authorTags=['article:author', 'dc.creator', 'property="author'] for tag in authorTags: if tag in content: author=content.split(tag+'" content=')[1][1:].split('>')[0] author=author[:-1] + #trim an extra quotation mark for The Hill + if sourceName=='The Hill': + author=author.split('"', 1)[0] break - #handle The Blaze + + if debugging: + print(author) + + + if 'og:description' in content: + description=content.split('og:description" content=')[1][1:].split('>')[0] + if description[-1]=='/': + description=description[:-1].strip() + description=description[:-1] else: - if 'class="article-author">' in content: - author=content.split('class="article-author">')[1].split('<')[0] - elif 'class="article-author" href="' in content: - author=content.split('class="article-author" href="')[1] - author=author.split('>')[1].split('<')[0].strip() + if sourceName=='The Hill': + description=content.split('div class="field-items"')[-1] + description=re.sub('<[^<]+?>', '', description) + description=description[1:200] + else: + print("SHOULDN'T GET HERE") - description=content.split('og:description" content=')[1][1:].split('>')[0] - if description[-1]=='/': - description=description[:-1].strip() - description=description[:-1] #strip out self-references description=description.replace(sourceName, 'our') + if debugging: + print(description) + + a=Article(title, url, img, description, sourceName, author) return a -- cgit v1.2.3