author    | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:48:10 -0600
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:48:10 -0600
commit    | c0a52698826fba2aeb5c2889f3856f051db1052c (patch)
tree      | b26190c77ad99a5400c7fa0f64d29537b90bee53
parent    | 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (diff)
modularized code a bit, and added Fox News with new parser
-rw-r--r-- | main.py   |  27
-rw-r--r-- | parser.py | 175
2 files changed, 136 insertions, 66 deletions
diff --git a/main.py b/main.py
--- a/main.py
+++ b/main.py
@@ -71,31 +71,8 @@ def run():
     nyt=buildNYT()
     sourceList.append(nyt)

-    '''
-    sourceList.append(NewsSource('New York Times',
-                'http://nytimes.com',
-                ['<a href="'],
-                ['<article class="story theme-summary"', '<a href="'],
-                ['<hr class="single-rule"', 'article class="story theme-summary', 'h2 class="story-heading"><a href="'],
-                '<div class="b-column column">', '<!-- close photo-spot-region -->',
-                'section id="top-news" class="top-news"', '</div><!-- close a-column -->',
-                'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo'))
-    '''
-
-
-
-
-    sourceList.append(NewsSource('Fox News',
-                'http://foxnews.com',
-                ['<h1><a href="'],
-                ['<li data-vr-contentbox=""><a href="'],
-                [],
-                None, None,
-                '<div class="top-stories">', '<section id="latest"',
-                None, None))
-
-
-
+    fox=buildFoxNews()
+    sourceList.append(fox)

     #scrape all urls and build data structure
     newsSourceArr=buildNewsSourceArr(sourceList)
diff --git a/parser.py b/parser.py
--- a/parser.py
+++ b/parser.py
@@ -4,9 +4,12 @@
 from unbiasedObjects import *
 from unbiasedFunctions import buildArticle
 import os

-def buildNYT():
-    url='http://www.nytimes.com'
+'''
+Takes in a URL, downloads the file to a temp file,
+reads the file into a string, and returns that string
+'''
+def urlToContent(url):

     #download file
     os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
@@ -15,6 +18,131 @@ def buildNYT():
     content=f.read()
     f.close()

+    return content
+
+
+'''
+Creates a new newsSource2 object. For each URL in h1-h3URLs,
+calls the file scraper and appends the new Article object.
+Returns a newsSource2 object
+'''
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
+    h1Arr=[]
+    h1Arr.append(buildArticle(h1URLs[0], name))
+
+    h2Arr=[]
+    for x in h2URLs:
+        h2Arr.append(buildArticle(x, name))
+
+    h3Arr=[]
+    for x in h3URLs:
+        h3Arr.append(buildArticle(x, name))
+
+    #BUILD THE NEWS SOURCE
+    newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
+
+    return newsSource
+
+
+'''
+Some sites will replicate URLs across the page. This function removes them.
+Check hierarchically: if h3 exists in h1s or h2s, remove from h3s;
+if h2 exists in h1s, remove from h2s
+
+also check partial URLs (e.g. nytimes.com/story.html is the same as
+nytimes.com/story.html?var=x
+'''
+def removeDuplicates(h1s, h2s, h3s):
+    #Assume h1s is one element, and keep it
+
+    #remove h2 duplicates
+    removeArr=[]
+    for i in range(len(h2s)):
+        #check internally
+        for j in range(len(h2s)):
+            if i==j:
+                continue
+            else:
+                if h2s[i] in h2s[j]:
+                    removeArr.append(h2s[j])
+        #check against h1s
+        for k in range(len(h1s)):
+            if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]):
+                removeArr.append(h2s[i])
+    for x in removeArr:
+        h2s.remove(x)
+
+    #remove h3 duplicates
+    removeArr=[]
+    for i in range(len(h3s)):
+        #check internally
+        for j in range(len(h3s)):
+            if i==j:
+                continue
+            else:
+                if h3s[i] in h3s[j]:
+                    removeArr.append(h3s[j])
+        #check against h1s and h2s
+        h1and2=h1s+h2s
+        for k in range(len(h1and2)):
+            if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]):
+                removeArr.append(h3s[i])
+    for x in removeArr:
+        h3s.remove(x)
+
+
+    return h1s, h2s, h3s
+
+
+def buildFoxNews():
+    url='http://foxnews.com'
+    name='Fox News'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content=urlToContent(url)
+
+    #get main headline
+    h1=content
+    h1=h1.split('<h1><a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+    h1s=[h1]
+
+    #GET SECONDARY HEADLINES
+    h2=content
+    h2s=[]
+    h2=h2.split('<div class="top-stories">', 1)[1]
+    h2=h2.split('<section id="latest"', 1)[0]
+    while '<li data-vr-contentbox=""><a href="' in h2:
+        h2=h2.split('<li data-vr-contentbox=""><a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if h1 not in x:
+            h2s.append(x)
+
+    #GET TERTIARY HEADLINES
+    h3=content
+    h3s=[]
+    h3=h3.split('div id="big-top"', 1)[1]
+    h3=h3.split('<div class="top-stories">', 1)[0]
+    while '<a href="' in h3:
+        h3=h3.split('<a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if h1 not in x:
+            h3s.append(x)
+
+    h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+    fox=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    return fox
+
+
+
+def buildNYT():
+    url='http://www.nytimes.com'
+    name='New York Times'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content=urlToContent(url)
+
     #get main headline
     #this will likely need if/else logic
     h1=content
@@ -24,7 +152,7 @@ def buildNYT():
     h1=h1.split('<a href="', 1)[1]
     h1=h1.split('"', 1)[0]

-    #GET SECONARY HEADLINES
+    #GET SECONDARY HEADLINES
     #This comes from the a column or b column, above the break
     h2=content
     h2s=[]
@@ -57,19 +185,6 @@ def buildNYT():
         if (h1 not in x) and (x not in h2s):
             h2s.append(x)

-    #REMOVE DUPLICATES
-    removeArr=[]
-    for i in range(len(h2s)):
-        for j in range(len(h2s)):
-            if i==j:
-                continue
-            else:
-                if h2s[i] in h2s[j]:
-                    removeArr.append(h2s[j])
-    for x in removeArr:
-        h2s.remove(x)
-
-
     #GET TERTIARY HEADLINES
     h3=content
     h3s=[]
@@ -86,32 +201,9 @@ def buildNYT():
         x=h3.split('"', 1)[0]
         if (h1 not in x) and (x not in h3s):
             h3s.append(x)

-    #REMOVE DUPLICATES
-    removeArr=[]
-    for i in range(len(h3s)):
-        for j in range(len(h3s)):
-            if i==j:
-                continue
-            else:
-                if h3s[i] in h3s[j]:
-                    removeArr.append(h3s[j])
-    for x in removeArr:
-        h3s.remove(x)
-
-
-    #BUILD THE ARTICLES BASED ON URLS
-    h1Arr=[]
-    h1Arr.append(buildArticle(h1, 'New York Times'))
-
-    h2Arr=[]
-    for x in h2s:
-        h2Arr.append(buildArticle(x, 'New York Times'))
-
-    h3Arr=[]
-    for x in h3s:
-        h3Arr.append(buildArticle(x, 'New York Times'))
-    nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr)
+    h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+    nyt=buildNewsSource2(name, url, h1s, h2s, h3s)

     return nyt

@@ -119,6 +211,7 @@ def buildNYT():

 '''
+NYT EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
 <div class="span-ab-layout layout">
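The refactor above moves the shared work into three parser.py helpers: urlToContent() fetches a homepage into a string, removeDuplicates() drops repeated or partially overlapping URLs across the headline tiers, and buildNewsSource2() turns the collected URLs into Article objects inside a NewsSource2. The sketch below is a minimal illustration (not part of the commit) of how a further outlet could be wired up from those helpers in the same style as buildFoxNews(); the outlet name, URL, and every HTML marker passed to split() are hypothetical placeholders rather than markers from a real site.

```python
#Minimal sketch only, not part of the commit. Assumes it lives in parser.py
#next to buildFoxNews(), so urlToContent, removeDuplicates, buildNewsSource2
#and the unbiasedObjects/unbiasedFunctions imports are already in scope.
def buildExampleSource():
    url='http://example-news.test'   #placeholder outlet, not a real source
    name='Example News'

    #DOWNLOAD HOMEPAGE CONTENT (wget into scratch/, then read it back)
    content=urlToContent(url)

    #get main headline: take the URL between a marker and the closing quote
    h1=content
    h1=h1.split('<h1 class="lead"><a href="', 1)[1]   #placeholder marker
    h1=h1.split('"', 1)[0]

    #GET SECONDARY HEADLINES from one bounded region of the page
    h2s=[]
    h2=content
    h2=h2.split('<div class="top-section">', 1)[1]    #placeholder markers
    h2=h2.split('<div class="footer">', 1)[0]
    while '<a href="' in h2:
        h2=h2.split('<a href="', 1)[1]
        x=h2.split('"', 1)[0]
        if h1 not in x:
            h2s.append(x)

    #drop repeated URLs, then build Article objects for each tier
    h1s, h2s, h3s = removeDuplicates([h1], h2s, [])
    return buildNewsSource2(name, url, h1s, h2s, h3s)
```

main.py would then register such a source the same way this commit registers Fox News: build it and append it to sourceList before buildNewsSourceArr() runs.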