summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author sstvinc2 <sstvinc2@gmail.com> 2017-02-14 21:48:10 -0600
committer sstvinc2 <sstvinc2@gmail.com> 2017-02-14 21:48:10 -0600
commit c0a52698826fba2aeb5c2889f3856f051db1052c (patch)
tree b26190c77ad99a5400c7fa0f64d29537b90bee53
parent 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (diff)
modularized code a bit, and added Fox News with new parser
-rw-r--r-- main.py 27
-rw-r--r-- parser.py 175
2 files changed, 136 insertions, 66 deletions
diff --git a/main.py b/main.py
index 63fd908..e26c8c2 100644
--- a/main.py
+++ b/main.py
@@ -71,31 +71,8 @@ def run():
nyt=buildNYT()
sourceList.append(nyt)
- '''
- sourceList.append(NewsSource('New York Times',
- 'http://nytimes.com',
- ['<a href="'],
- ['<article class="story theme-summary"', '<a href="'],
- ['<hr class="single-rule"', 'article class="story theme-summary', 'h2 class="story-heading"><a href="'],
- '<div class="b-column column">', '<!-- close photo-spot-region -->',
- 'section id="top-news" class="top-news"', '</div><!-- close a-column -->',
- 'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo'))
- '''
-
-
-
-
- sourceList.append(NewsSource('Fox News',
- 'http://foxnews.com',
- ['<h1><a href="'],
- ['<li data-vr-contentbox=""><a href="'],
- [],
- None, None,
- '<div class="top-stories">', '<section id="latest"',
- None, None))
-
-
-
+ fox=buildFoxNews()
+ sourceList.append(fox)
#scrape all urls and build data structure
newsSourceArr=buildNewsSourceArr(sourceList)
diff --git a/parser.py b/parser.py
index 2020f55..16382ab 100644
--- a/parser.py
+++ b/parser.py
@@ -4,9 +4,12 @@ from unbiasedObjects import *
from unbiasedFunctions import buildArticle
import os
-def buildNYT():
- url='http://www.nytimes.com'
+'''
+Takes in a URL, downloads the file to a temp file,
+reads the file into a string, and returns that string
+'''
+def urlToContent(url):
#download file
os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
@@ -15,6 +18,131 @@ def buildNYT():
content=f.read()
f.close()
+ return content
+
+
def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
    '''
    Build a NewsSource2 object from three tiers of article URLs.

    Only the first entry of h1URLs is turned into an article; every
    entry of h2URLs and h3URLs is. Each article is produced by calling
    buildArticle(articleUrl, name).

    name   -- source name, passed to buildArticle and to NewsSource2
    url    -- source homepage URL, passed to NewsSource2
    h1URLs -- main headline URL(s); must contain at least one entry
    h2URLs -- secondary headline URLs
    h3URLs -- tertiary headline URLs

    Returns the NewsSource2 built from the three article lists.
    '''
    # Articles are built in tier order: h1 first, then h2s, then h3s.
    mainArticles = [buildArticle(h1URLs[0], name)]
    secondaryArticles = [buildArticle(link, name) for link in h2URLs]
    tertiaryArticles = [buildArticle(link, name) for link in h3URLs]

    return NewsSource2(name, url, mainArticles, secondaryArticles, tertiaryArticles)
+
+
def _dedupeTier(urls, higherTiers):
    '''
    Return the entries of urls that survive de-duplication, in order.

    An entry is dropped when:
      * it contains, or is contained in, any URL in higherTiers; or
      * another entry of urls is a strict substring of it (the shorter
        form of the URL wins); or
      * an identical entry appears earlier in urls (first copy wins).
    '''
    kept = []
    for j, candidate in enumerate(urls):
        # Cross-tier check: substring match in either direction.
        if any((candidate in top) or (top in candidate) for top in higherTiers):
            continue
        # Same-tier check. Requiring "differs, or appears earlier" keeps
        # exactly one copy of an exact duplicate instead of dropping all.
        shadowed = any(
            other in candidate and (other != candidate or i < j)
            for i, other in enumerate(urls)
            if i != j
        )
        if not shadowed:
            kept.append(candidate)
    return kept


def removeDuplicates(h1s, h2s, h3s):
    '''
    Some sites replicate URLs across the page. This function removes them.

    Checks hierarchically: an h3 that matches an h1 or h2 is removed
    from h3s; an h2 that matches an h1 is removed from h2s. h1s is
    assumed to be a single element and is always kept as-is.

    Matching is by substring, so partial URLs count as duplicates
    (e.g. nytimes.com/story.html is the same as
    nytimes.com/story.html?var=x). Within one tier the shorter URL is
    kept, and of several identical URLs the first is kept.

    h2s and h3s are updated in place; the same three list objects are
    returned as a tuple (h1s, h2s, h3s).
    '''
    # Slice assignment keeps the caller's list objects, matching the
    # in-place behaviour callers may rely on.
    h2s[:] = _dedupeTier(h2s, h1s)
    # h3s are compared against the already de-duplicated h2s.
    h3s[:] = _dedupeTier(h3s, h1s + h2s)

    return h1s, h2s, h3s
+
+
def buildFoxNews():
    '''
    Scrape the Fox News homepage and return it as a news source object.

    Downloads the homepage with urlToContent, extracts the main,
    secondary and tertiary headline URLs by splitting on fixed HTML
    markers, de-duplicates them with removeDuplicates, and hands the
    result to buildNewsSource2.

    A missing marker for the main headline or for a section boundary
    makes the corresponding split(...)[1] raise IndexError.
    '''
    url = 'http://foxnews.com'
    name = 'Fox News'

    def hrefsAfter(region, marker):
        # Collect the text between each occurrence of marker and the
        # next double quote, scanning region left to right.
        found = []
        while marker in region:
            region = region.split(marker, 1)[1]
            found.append(region.split('"', 1)[0])
        return found

    # DOWNLOAD HOMEPAGE CONTENT
    page = urlToContent(url)

    # Main headline: first link wrapped in an <h1>.
    h1 = page.split('<h1><a href="', 1)[1].split('"', 1)[0]

    # Secondary headlines: list items between the top-stories div and
    # the "latest" section.
    topStories = page.split('<div class="top-stories">', 1)[1]
    topStories = topStories.split('<section id="latest"', 1)[0]
    h2s = [
        link
        for link in hrefsAfter(topStories, '<li data-vr-contentbox=""><a href="')
        if h1 not in link
    ]

    # Tertiary headlines: every link between the big-top div and the
    # top-stories div.
    bigTop = page.split('div id="big-top"', 1)[1]
    bigTop = bigTop.split('<div class="top-stories">', 1)[0]
    h3s = [link for link in hrefsAfter(bigTop, '<a href="') if h1 not in link]

    h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
    return buildNewsSource2(name, url, h1s, h2s, h3s)
+
+
+
+def buildNYT():
+ url='http://www.nytimes.com'
+ name='New York Times'
+
+ #DOWNLOAD HOMEPAGE CONTENT
+ content=urlToContent(url)
+
#get main headline
#this will likely need if/else logic
h1=content
@@ -24,7 +152,7 @@ def buildNYT():
h1=h1.split('<a href="', 1)[1]
h1=h1.split('"', 1)[0]
- #GET SECONARY HEADLINES
+ #GET SECONDARY HEADLINES
#This comes from the a column or b column, above the break
h2=content
h2s=[]
@@ -57,19 +185,6 @@ def buildNYT():
if (h1 not in x) and (x not in h2s):
h2s.append(x)
- #REMOVE DUPLICATES
- removeArr=[]
- for i in range(len(h2s)):
- for j in range(len(h2s)):
- if i==j:
- continue
- else:
- if h2s[i] in h2s[j]:
- removeArr.append(h2s[j])
- for x in removeArr:
- h2s.remove(x)
-
-
#GET TERTIARY HEADLINES
h3=content
h3s=[]
@@ -86,32 +201,9 @@ def buildNYT():
x=h3.split('"', 1)[0]
if (h1 not in x) and (x not in h3s):
h3s.append(x)
- #REMOVE DUPLICATES
- removeArr=[]
- for i in range(len(h3s)):
- for j in range(len(h3s)):
- if i==j:
- continue
- else:
- if h3s[i] in h3s[j]:
- removeArr.append(h3s[j])
- for x in removeArr:
- h3s.remove(x)
-
-
- #BUILD THE ARTICLES BASED ON URLS
- h1Arr=[]
- h1Arr.append(buildArticle(h1, 'New York Times'))
-
- h2Arr=[]
- for x in h2s:
- h2Arr.append(buildArticle(x, 'New York Times'))
-
- h3Arr=[]
- for x in h3s:
- h3Arr.append(buildArticle(x, 'New York Times'))
- nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr)
+ h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+ nyt=buildNewsSource2(name, url, h1s, h2s, h3s)
return nyt
@@ -119,6 +211,7 @@ def buildNYT():
'''
+NYT
EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
<div class="span-ab-layout layout">