From ef0dc339f42c6befd07f0d626c1eaed8ad7ee057 Mon Sep 17 00:00:00 2001
From: ssstvinc2 <sstvinc2@gmail.com>
Date: Fri, 24 Mar 2017 19:05:54 -0400
Subject: Added ABC News, some parser fixes as well

---
 main.py              |   4 +-
 parser.py            | 124 +++++++++++++++++++++++++++++++++++++++++++++++++--
 spotCheck.py         |   4 +-
 unbiasedFunctions.py |   5 ++-
 4 files changed, 128 insertions(+), 9 deletions(-)
diff --git a/main.py b/main.py
index 735ff6b..a109d2f 100755
--- a/main.py
+++ b/main.py
@@ -21,9 +21,7 @@ def run():
     SOURCES TO ADD NEXT:
     -ABC
     -REUTERS
-    -Christian Science Monitor
     -Town Hall
-    -Washington Times
 
     '''
 
@@ -31,7 +29,7 @@ def run():
     ### These values have to be the second half of the function name
     ### E.g. Guardian calls buildGuardian(), etc.
     sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS',
-                 'FoxNews', 'WashTimes'] #'Blaze'
+                 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze'
     
     for source in sourceFnArr:
         tries=0
diff --git a/parser.py b/parser.py
index 942612a..cf56d13 100755
--- a/parser.py
+++ b/parser.py
@@ -32,19 +32,28 @@ Returns a newsSource2 object
 '''
 def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
     h1Arr=[]
-    h1Arr.append(buildArticle(h1URLs[0], name))
+    a=buildArticle(h1URLs[0], name)
+    if a==None:
+        print('................\nH1 Nonetype in '+name+'\n................')
+    else:
+        h1Arr.append(a)
 
     h2Arr=[]
     for x in h2URLs:
         a=buildArticle(x, name)
         if a!=None:
             h2Arr.append(a)
+        else:
+            print('................\nH2 Nonetype in '+name+'\n................')
 
+            
     h3Arr=[]
     for x in h3URLs:
         a=buildArticle(x, name)
         if a!=None:
             h3Arr.append(a)
+        else:
+            print('................\nH3 Nonetype in '+name+'\n................')
 
     #BUILD THE NEWS SOURCE
     newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
@@ -268,8 +277,6 @@ def buildWashTimes():
     #GET SECONDARY HEADLINES
     h2=content
     h2s=[]
-    #only the h1 and the two h2s have this, so split on it and grab
-    #the second two
     h2=h2.split('class="top-news', 1)[1]
     h2=h2.split('</article>', 1)[1] #end of top-news article
     h2=h2.split('<article ', 1)[0] #note the space; we want unclassed articles
@@ -299,6 +306,60 @@ def buildWashTimes():
     return wat
 
 
+def buildCSM():  # Build the Christian Science Monitor news source from its homepage HTML
+    url='http://www.csmonitor.com'
+    name='Christian Science Monitor'
+
+
+    # Download the homepage; everything below is plain string splitting on it
+    content=urlToContent(url)
+    
+    # Main headline: first link after the 'ui-top-center' marker
+    h1=content
+    h1=h1.split('ui-top-center', 1)[1]  # [1] raises IndexError if the marker is absent
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]  # keep only the href value
+
+    h1s=[url+h1]  # href is appended to the site root; assumes site-relative links - TODO confirm
+
+    # Secondary headlines: links between the 'block-3-1' and 'ui-top-right' markers
+    h2=content
+    h2s=[]
+    h2=h2.split('block-3-1', 1)[1]
+    h2=h2.split('ui-top-right', 1)[0]
+    h2=h2.split('<h3 class="story_headline">')[1:]  # one chunk per headline; [0] is the text before the first
+    
+    for x in h2:
+        x=x.split('<a href="', 1)[1]  # first link in the chunk
+        x=x.split('"', 1)[0]
+        h2s.append(url+x)
+
+    # Tertiary headlines: links between the 'block-2-1' and 'block-2-2' markers
+    h3=content
+    h3s=[]
+    h3=h3.split('block-2-1', 1)[1]
+    h3=h3.split('block-2-2', 1)[0]
+    h3=h3.split('<h3 class="story_headline')[1:]  # no closing '">' here, so extra classes also match
+    
+    for x in h3:
+        x=x.split('<a href="', 2)[-1]  # text after the second link if the chunk has two or more, else after the first
+        x=x.split('"', 1)[0]
+        h3s.append(url+x)
+
+    h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)  # helper defined outside this hunk
+
+    csm=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    badTitleArr=['Change Agent']  # NOTE(review): presumably substrings that disqualify a story - confirm in removeBadStories
+    badDescArr=None  # None: presumably no filtering on this field - confirm
+    badAuthorArr=None
+    badImgArr=['csm_logo']
+    badURLArr=['difference-maker']
+    csm=removeBadStories(csm, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
+
+    return csm
+
+
 
 '''
 Function to fix the oddly short og:descriptions provided
@@ -679,6 +740,61 @@ def buildNPR():
 
 
 
+
+def buildABC():  # Build the ABC News source from its homepage HTML
+    url='http://www.abcnews.go.com'
+    name='ABC News'
+
+    # Download the homepage; everything below is plain string splitting on it
+    content=urlToContent(url)
+    
+    # Main headline: first link after the id="row-1" marker
+    h1=content
+    h1=h1.split('id="row-1"', 1)[1]  # [1] raises IndexError if the marker is absent
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]  # keep only the href value
+    h1s=[h1]  # href used as-is, no url prefix; assumes absolute links - TODO confirm
+
+    # Secondary headlines: cards between the id="row-2" and id="row-3" markers
+    h2=content
+    h2s=[]
+    h2=h2.split('id="row-2"', 1)[1]
+    h2=h2.split('id="row-3"', 1)[0]
+    h2=h2.split('card single row-item')[1:3] # [1:3] keeps at most the first two cards; [0] is the text before the first
+    for x in h2:
+        x=x.split('<a href="', 1)[1]  # first link in the card
+        x=x.split('"', 1)[0]
+        if h1 not in x:  # skip any link that contains the main headline URL
+            h2s.append(x)
+
+    # Tertiary headlines: every link inside the active tab that follows row-1
+    h3=content
+    h3s=[]
+    h3=h3.split('id="row-1"', 1)[1]
+    h3=h3.split('tab-data active', 1)[1]
+    h3=h3.split('tab-data"', 1)[0] # trailing quote: stops at the next 'tab-data"' that has no further class names
+    while '<a href="' in h3:
+        h3=h3.split('<a href="', 1)[1]  # consume up to and including the next link opener
+        x=h3.split('"', 1)[0]
+        if h1 not in x:  # skip any link that contains the main headline URL
+            h3s.append(x)
+
+    h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)  # passes a fresh [h1] list and rebinds h1s to the result
+    abc=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    # Story filters: all None here. NOTE(review): presumably None means no filtering - confirm in removeBadStories
+    badTitleArr=None
+    badDescArr=None
+    badAuthorArr=None
+    badImgArr=None
+    badURLArr=None
+    abc=removeBadStories(abc, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)
+
+    return abc
+
+
+
+
 def buildFoxNews():
     url='http://foxnews.com'
     name='Fox News'
@@ -718,7 +834,7 @@ def buildFoxNews():
     fox=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     #REMOVE BAD STORIES
-    badTitleArr=['O&#039;Reilly', 'Fox News', 'Brett Baier']
+    badTitleArr=['O&#039;Reilly', 'Fox News', 'Brett Baier', 'Tucker']
     badDescArr=['Sean Hannity']
     badAuthorArr=['Bill O\'Reilly', 'Sean Hannity']
     badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg']
diff --git a/spotCheck.py b/spotCheck.py
index 7bf46bb..d1edda4 100755
--- a/spotCheck.py
+++ b/spotCheck.py
@@ -15,7 +15,9 @@ def spotCheck(src):
            'blz' : buildBlaze,
            'bbc' : buildBBC,
            'nbc' : buildNBC,
-           'wat' : buildWashTimes}
+           'wat' : buildWashTimes,
+           'csm' : buildCSM,
+           'abc' : buildABC}
 
     data=fns[src]()
 
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index fca2f2d..1a80d7a 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -41,7 +41,10 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd,
                 img=img.split('src="', 1)[1].split('"')[0]
             
         else:
-            img=content.split('og:image" content=')[1][1:].split('>')[0]
+            if 'og:image' in content:
+                img=content.split('og:image" content=')[1][1:].split('>')[0]
+            elif sourceName=='ABC News':
+                img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX'
             if img[-1]=='/':
                 #because the quote separator could be ' or ", 
                 #trim to just before it then lop it off
-- 
cgit v1.2.3