summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: sstvinc2 <sstvinc2@gmail.com> 2017-02-18 22:23:51 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-18 22:23:51 -0600
commit: d1c7dfc9c2a47edf80527c2457481b9508087ce6 (patch)
tree: 4db738c4d308dbcbb2300f8e304635bdd023f322
parent: f19dd7a3291e2d61d4d76eef5300df522193fa1e (diff)
Added The Hill; also tweaked buildArticle()
-rw-r--r-- main.py 9
-rw-r--r-- parser.py 49
-rw-r--r-- unbiasedFunctions.py 61
3 files changed, 98 insertions, 21 deletions
diff --git a/main.py b/main.py
index 8318da0..f381db5 100644
--- a/main.py
+++ b/main.py
@@ -20,10 +20,12 @@ def run():
SOURCES TO ADD NEXT:
-ABC
-REUTERS
- -THE HILL
'''
+ hil=buildTheHill()
+ sourceList.append(hil)
+
nyt=buildNYT()
sourceList.append(nyt)
@@ -57,8 +59,9 @@ def run():
cbs=buildCBS()
sourceList.append(cbs)
- wkl=buildWeeklyStandard()
- sourceList.append(wkl)
+ #Weekly standard just doesn't update frequently enough
+ #wkl=buildWeeklyStandard()
+ #sourceList.append(wkl)
fox=buildFoxNews()
sourceList.append(fox)
diff --git a/parser.py b/parser.py
index 5cb1c51..6b7b0a6 100644
--- a/parser.py
+++ b/parser.py
@@ -176,6 +176,53 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b
+
+def buildTheHill():
+ url='http://thehill.com'
+ name='The Hill'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
+ #get main headline
+ h1=content
+ h1=h1.split('<div class="headline-story-image">', 1)[1]
+ h1=h1.split('<a href="', 1)[1]
+ h1=h1.split('"', 1)[0]
+ h1s=[url+h1]
+
+ #GET SECONDARY HEADLINES
+ h2=content
+ h2s=[]
+ h2=h2.split('<div class="section-top-content">', 1)[1]
+ h2=h2.split('</ul>', 1)[0]
+ while '<div class="top-story-item' in h2 and len(h2s)<4:
+ h2=h2.split('<div class="top-story-item', 1)[1]
+ x=h2.split('<a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h2s.append(url+x)
+
+ #GET TERTIARY HEADLINES
+ h3=content
+ h3s=[]
+ h3=h3.split('<div class="section-top-content">', 1)[1]
+ h3=h3.split('</ul>', 1)[0]
+ while '<div class="top-story-item small' in h3:
+ h3=h3.split('<div class="top-story-item small', 1)[1]
+ x=h3.split('<a href="', 1)[1]
+ x=x.split('"', 1)[0]
+ h3s.append(url+x)
+
+ h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+ hil=buildNewsSource2(name, url, h1s, h2s, h3s)
+ #hil=removeBadStories(gdn, None, None, None, None)
+
+ return hil
+
+
+
+
+
def buildGuardian():
url='http://www.theguardian.com/us-news'
name='The Guardian'
@@ -626,7 +673,7 @@ def buildFoxNews():
badDescArr=None
badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
- badURLArr=['http://www.foxnews.com/opinion']
+ badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com']
fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
return fox
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index ef0265f..cab7681 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -2,16 +2,17 @@ from unbiasedObjects import *
import os
import random
import time
+import re
#take in a url and delimiters, return twitter card
def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
- '''#PRINT DEBUGGING
- print(sourceName)
- print(url)
- print()
- '''
+ debugging=False
+ if debugging:
+ print(sourceName)
+ print(url)
+ print()
#download url
os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
@@ -47,34 +48,60 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
img=img[:-1].strip()
img=img[:-1]
+ if debugging:
+ print(img)
+
title=content.split('og:title" content=')[1][1:].split('>')[0]
if title[-1]=='/':
title=title[:-1].strip()
title=title[:-1]
+ if debugging:
+ print(title)
+
+
author=''
- if sourceName!='The Blaze':
- authorTags=['article:author', 'dc.creator']
+ if sourceName=='The Blaze':
+ if 'class="article-author">' in content:
+ author=content.split('class="article-author">')[1].split('<')[0]
+ elif 'class="article-author" href="' in content:
+ author=content.split('class="article-author" href="')[1]
+ author=author.split('>')[1].split('<')[0].strip()
+ else:
+ authorTags=['article:author', 'dc.creator', 'property="author']
for tag in authorTags:
if tag in content:
author=content.split(tag+'" content=')[1][1:].split('>')[0]
author=author[:-1]
+ #trim an extra quotation mark for The Hill
+ if sourceName=='The Hill':
+ author=author.split('"', 1)[0]
break
- #handle The Blaze
+
+ if debugging:
+ print(author)
+
+
+ if 'og:description' in content:
+ description=content.split('og:description" content=')[1][1:].split('>')[0]
+ if description[-1]=='/':
+ description=description[:-1].strip()
+ description=description[:-1]
else:
- if 'class="article-author">' in content:
- author=content.split('class="article-author">')[1].split('<')[0]
- elif 'class="article-author" href="' in content:
- author=content.split('class="article-author" href="')[1]
- author=author.split('>')[1].split('<')[0].strip()
+ if sourceName=='The Hill':
+ description=content.split('div class="field-items"')[-1]
+ description=re.sub('<[^<]+?>', '', description)
+ description=description[1:200]
+ else:
+ print("SHOULDN'T GET HERE")
- description=content.split('og:description" content=')[1][1:].split('>')[0]
- if description[-1]=='/':
- description=description[:-1].strip()
- description=description[:-1]
#strip out self-references
description=description.replace(sourceName, 'our')
+ if debugging:
+ print(description)
+
+
a=Article(title, url, img, description, sourceName, author)
return a