Added The Hill; also tweaked buildArticle()

author: sstvinc2 <sstvinc2@gmail.com> 2017-02-18 22:23:51 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-18 22:23:51 -0600
commit: d1c7dfc9c2a47edf80527c2457481b9508087ce6 (patch)
tree: 4db738c4d308dbcbb2300f8e304635bdd023f322
parent: f19dd7a3291e2d61d4d76eef5300df522193fa1e (diff)
3 files changed, 98 insertions, 21 deletions
diff --git a/main.py b/main.py
index 8318da0..f381db5 100644
--- a/main.py
+++ b/main.py
@@ -20,10 +20,12 @@ def run():
     SOURCES TO ADD NEXT:
     -ABC
     -REUTERS
-    -THE HILL
 
     '''
 
+    hil=buildTheHill()
+    sourceList.append(hil)
+
     nyt=buildNYT()
     sourceList.append(nyt)
 
@@ -57,8 +59,9 @@ def run():
     cbs=buildCBS()
     sourceList.append(cbs)
 
-    wkl=buildWeeklyStandard()
-    sourceList.append(wkl)
+    #Weekly standard just doesn't update frequently enough
+    #wkl=buildWeeklyStandard()
+    #sourceList.append(wkl)
 
     fox=buildFoxNews()
     sourceList.append(fox)
diff --git a/parser.py b/parser.py
index 5cb1c51..6b7b0a6 100644
--- a/parser.py
+++ b/parser.py
@@ -176,6 +176,53 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b
 
 
 
+#Build the news source for The Hill by scraping story links out of its homepage markup
+def buildTheHill():
+    url='http://thehill.com'
+    name='The Hill'
+    #Takes no arguments; returns whatever buildNewsSource2() builds from the three link lists below
+    #Download the homepage content
+    content=urlToContent(url)
+
+    #Get the main headline: the first href that follows the "headline-story-image" div
+    h1=content
+    h1=h1.split('<div class="headline-story-image">', 1)[1]  #assumes content is a str containing this marker; otherwise [1] fails
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+    h1s=[url+h1]  #the href is appended to the base url, so it is presumably site-relative
+
+    #Get secondary headlines: at most 4 links from the "section-top-content" block, cut at its first </ul>
+    h2=content
+    h2s=[]
+    h2=h2.split('<div class="section-top-content">', 1)[1]
+    h2=h2.split('</ul>', 1)[0]
+    while '<div class="top-story-item' in h2 and len(h2s)<4:  #this prefix also matches the "small" items collected below
+        h2=h2.split('<div class="top-story-item', 1)[1]
+        x=h2.split('<a href="', 1)[1]
+        x=x.split('"', 1)[0]
+        h2s.append(url+x)
+
+    #Get tertiary headlines: every "top-story-item small" link from the same block, with no cap on the count
+    h3=content
+    h3s=[]
+    h3=h3.split('<div class="section-top-content">', 1)[1]
+    h3=h3.split('</ul>', 1)[0]
+    while '<div class="top-story-item small' in h3:
+        h3=h3.split('<div class="top-story-item small', 1)[1]
+        x=h3.split('<a href="', 1)[1]
+        x=x.split('"', 1)[0]
+        h3s.append(url+x)
+    #h2s and h3s are cut from the same block and may share links; removeDuplicates() runs on the lists before building
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
+    hil=buildNewsSource2(name, url, h1s, h2s, h3s)
+    #NOTE(review): story filtering is disabled; the call below passes gdn (not hil) and 5 args where buildFoxNews passes 6
+    #hil=removeBadStories(gdn, None, None, None, None)
+    return hil
+
+
+
+
+
 def buildGuardian():
     url='http://www.theguardian.com/us-news'
     name='The Guardian'
@@ -626,7 +673,7 @@ def buildFoxNews():
     badDescArr=None
     badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
     badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
-    badURLArr=['http://www.foxnews.com/opinion']
+    badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com']
     fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
 
     return fox
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index ef0265f..cab7681 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -2,16 +2,17 @@ from unbiasedObjects import *
 import os
 import random
 import time
+import re
 
 
 #take in a url and delimiters, return twitter card
 def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
 
-    '''#PRINT DEBUGGING
-    print(sourceName)
-    print(url)
-    print()
-    '''
+    debugging=False
+    if debugging:
+        print(sourceName)
+        print(url)
+        print()
     
     #download url
     os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
@@ -47,34 +48,60 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
                 img=img[:-1].strip()
             img=img[:-1]
 
+        if debugging:
+            print(img)
+
         title=content.split('og:title" content=')[1][1:].split('>')[0]
         if title[-1]=='/':
             title=title[:-1].strip()
         title=title[:-1]
 
+        if debugging:
+            print(title)
+
+
         author=''
-        if sourceName!='The Blaze':
-            authorTags=['article:author', 'dc.creator']
+        if sourceName=='The Blaze':
+            if 'class="article-author">' in content:
+                author=content.split('class="article-author">')[1].split('<')[0]
+            elif 'class="article-author" href="' in content:
+                author=content.split('class="article-author" href="')[1]
+                author=author.split('>')[1].split('<')[0].strip()
+        else:
+            authorTags=['article:author', 'dc.creator', 'property="author']
             for tag in authorTags:
                 if tag in content:
                     author=content.split(tag+'" content=')[1][1:].split('>')[0]
                     author=author[:-1]
+                    #trim an extra quotation mark for The Hill
+                    if sourceName=='The Hill':
+                        author=author.split('"', 1)[0]
                     break
-        #handle The Blaze
+
+        if debugging:
+            print(author)
+
+
+        if 'og:description' in content:
+            description=content.split('og:description" content=')[1][1:].split('>')[0]
+            if description[-1]=='/':
+                description=description[:-1].strip()
+            description=description[:-1]
         else:
-            if 'class="article-author">' in content:
-                author=content.split('class="article-author">')[1].split('<')[0]
-            elif 'class="article-author" href="' in content:
-                author=content.split('class="article-author" href="')[1]
-                author=author.split('>')[1].split('<')[0].strip()
+            if sourceName=='The Hill':
+                description=content.split('div class="field-items"')[-1]
+                description=re.sub('<[^<]+?>', '', description)
+                description=description[1:200]
+            else:
+                print("SHOULDN'T GET HERE")
 
-        description=content.split('og:description" content=')[1][1:].split('>')[0]
-        if description[-1]=='/':
-            description=description[:-1].strip()
-        description=description[:-1]
         #strip out self-references
         description=description.replace(sourceName, 'our')
 
+        if debugging:
+            print(description)
+
+
         a=Article(title, url, img, description, sourceName, author)
         return a
author	sstvinc2 <sstvinc2@gmail.com>	2017-02-18 22:23:51 -0600
committer	sstvinc2 <sstvinc2@gmail.com>	2017-02-18 22:23:51 -0600
commit	d1c7dfc9c2a47edf80527c2457481b9508087ce6 (patch)
tree	4db738c4d308dbcbb2300f8e304635bdd023f322
parent	f19dd7a3291e2d61d4d76eef5300df522193fa1e (diff)