2 files changed, 44 insertions, 58 deletions
diff --git a/main.py b/main.py
index 182ae26..c54487e 100755
--- a/main.py
+++ b/main.py
@@ -5,6 +5,7 @@ from unbiasedFunctions import *
 from parser import *
 import time
 
+
 def main():
     while True:
         print('-----------------------')
@@ -23,49 +24,27 @@ def run():
 
     '''
 
-    #for some reason, The Guardian sometimes just doesn't work right?
-    #loop until it gets it right
-    '''
-    h1='https://www.theguardian.com/us'
-    looped=False
-    while h1=='https://www.theguardian.com/us':
-        try:
-            gdn=buildGuardian()
-            h1=gdn.h1Arr[0]
-        except:
-            print('The Guardian: build error. Looping again.')
-        looped=True
-    '''
-    gdn=buildGuardian()
-    sourceList.append(gdn)
-
-    hil=buildTheHill()
-    sourceList.append(hil)
-
-    #nyt=buildNYT()
-    #sourceList.append(nyt)
-
-    npr=buildNPR()
-    sourceList.append(npr)
 
-    blz=buildBlaze()
-    sourceList.append(blz)
-
-    bbc=buildBBC()
-    sourceList.append(bbc)
-
-    nbc=buildNBC()
-    sourceList.append(nbc)
-
-    cbs=buildCBS()
-    sourceList.append(cbs)
-
-    #Weekly standard just doesn't update frequently enough
-    #wkl=buildWeeklyStandard()
-    #sourceList.append(wkl)
-
-    fox=buildFoxNews()
-    sourceList.append(fox)
+    ### Each entry is the part of a builder function's name that follows
+    ### 'build', e.g. 'Guardian' makes the loop below call buildGuardian().
+    sourceFnArr=['Guardian', 'TheHill', 'NPR', 'Blaze', 'BBC', 'NBC', 'CBS',
+                 'FoxNews', ]
+    
+    for source in sourceFnArr:
+        tries=0
+        while tries<3:
+            try:
+                fn='build'+source
+                possibles = globals().copy()
+                possibles.update(locals())
+                method = possibles.get(fn)
+                src=method()
+                sourceList.append(src)
+                break
+            except:
+                print('Build error. Looping again: '+source)
+                tries+=1
+                time.sleep(tries)
     
     #scrape all urls and build data structure
     newsSourceArr=buildNewsSourceArr(sourceList)
diff --git a/parser.py b/parser.py
index 21f0669..671e2e5 100755
--- a/parser.py
+++ b/parser.py
@@ -119,7 +119,7 @@ def removeBadStoriesHelper(source, element, badStringList, arr):
         for i in range(len(arr)):
             for hed in arr[i]:
                 if hed==None:
-                    print("////////\nNone type found in removeBadStoriesHelper for "+source+"\n/////////")
+                    print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////")
                     break
                 for item in badStringList:
                     if item in getattr(hed, element):
@@ -197,14 +197,23 @@ def buildGuardian():
     url='http://www.theguardian.com/us'
     name='The Guardian US'
 
-    #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url, 'utf8')
-    
-    #get main headline
-    h1=content
-    h1=h1.split('<h1', 1)[1]
-    h1=h1.split('<a href="', 1)[1]
-    h1=h1.split('"', 1)[0]
+
+    while True:
+        #DOWNLOAD HOMEPAGE CONTENT
+        content=urlToContent(url, 'utf8')
+        
+        #get main headline
+        h1=content
+        h1=h1.split('<h1', 1)[1]
+        h1=h1.split('<a href="', 1)[1]
+        h1=h1.split('"', 1)[0]
+
+        print(h1)
+        if h1!='https://www.theguardian.com/us':
+            break
+        else:
+            print('Guardian loop')
+        
     h1s=[h1]
 
     #GET SECONDARY HEADLINES
@@ -233,7 +242,7 @@ def buildGuardian():
         h3s.append(x)
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-
+    
     gdn=buildNewsSource2(name, url, h1s, h2s, h3s)
     gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None)
 
@@ -269,7 +278,6 @@ def buildBlaze():
     #get main headline
     h1=content
     h1=h1.split('<!-- home -->', 1)[1]
-    h1=h1.split('<!-- loop-home -->', 1)[0]
     h1=h1.split('<a class="gallery-link" href="', 1)[1]
     h1=h1.split('"', 1)[0]
     h1s=[url+h1]
@@ -279,9 +287,9 @@ def buildBlaze():
     h2s=[]
     h2=h2.split('<!-- home -->', 1)[1]
     h2=h2.split('<!-- loop-home -->', 1)[0]
-    while '</figure>\n\n<figure class="gallery-item">' in h2:
-        h2=h2.split('</figure>\n\n<figure class="gallery-item">', 1)[1]
-        h2=h2.split('href="', 1)[1]
+    while '<a class="gallery-link" href="' in h2:#'</figure>\n\n<figure class="gallery-item">' in h2:
+        h2=h2.split('<a class="gallery-link" href="', 1)[1]#'</figure>\n\n<figure class="gallery-item">', 1)[1]
+        #no separate split on 'href="' is needed: the delimiter above already ends with it
         x=h2.split('"', 1)[0]
         if h1 not in x:
             h2s.append(url+x)
@@ -299,12 +307,11 @@ def buildBlaze():
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
 
-
     blz=buildNewsSource2(name, url, h1s, h2s, h3s)
 
     badTitleArr=['Tucker Carlson', 'Mark Levin']
     badDescArr=['Lawrence Jones', 'Mike Slater']
-    badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson']
+    badAuthorArr=['Matt Walsh', 'Tomi Lahren', 'Dana Loesch', 'Mike Opelka', 'Chris Salcedo', 'Justin Haskins', 'Sara Gonzales', 'Doc Thompson', 'Glenn Beck']
     badImgArr=None
     badURLArr=None
     blz=removeBadStories(blz, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr)