author    | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:48:10 -0600
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:48:10 -0600
commit    | c0a52698826fba2aeb5c2889f3856f051db1052c (patch)
tree      | b26190c77ad99a5400c7fa0f64d29537b90bee53
parent    | 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (diff)
modularized code a bit, and added Fox News with new parser
-rw-r--r-- | main.py   |  27
-rw-r--r-- | parser.py | 175
2 files changed, 136 insertions, 66 deletions
diff --git a/main.py b/main.py
--- a/main.py
+++ b/main.py
@@ -71,31 +71,8 @@ def run():
     nyt=buildNYT()
     sourceList.append(nyt)

-    '''
-    sourceList.append(NewsSource('New York Times',
-                'http://nytimes.com',
-                ['<a href="'],
-                ['<article class="story theme-summary"', '<a href="'],
-                ['<hr class="single-rule"', 'article class="story theme-summary', 'h2 class="story-heading"><a href="'],
-                '<div class="b-column column">', '<!-- close photo-spot-region -->',
-                'section id="top-news" class="top-news"', '</div><!-- close a-column -->',
-                'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo'))
-    '''
-
-
-
-
-    sourceList.append(NewsSource('Fox News',
-                'http://foxnews.com',
-                ['<h1><a href="'],
-                ['<li data-vr-contentbox=""><a href="'],
-                [],
-                None, None,
-                '<div class="top-stories">', '<section id="latest"',
-                None, None))
-
-
-
+    fox=buildFoxNews()
+    sourceList.append(fox)

     #scrape all urls and build data structure
     newsSourceArr=buildNewsSourceArr(sourceList)
diff --git a/parser.py b/parser.py
--- a/parser.py
+++ b/parser.py
@@ -4,9 +4,12 @@
 from unbiasedObjects import *
 from unbiasedFunctions import buildArticle
 import os

-def buildNYT():
-    url='http://www.nytimes.com'
+'''
+Takes in a URL, downloads the file to a temp file,
+reads the file into a string, and returns that string
+'''
+def urlToContent(url):

     #download file
     os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
@@ -15,6 +18,131 @@ def buildNYT():
     content=f.read()
     f.close()

+    return content
+
+
+'''
+Creates a new newsSource2 object. For each URL in h1-h3URLs,
+calls the file scraper and appends the new Article object.
+Returns a newsSource2 object
+'''
+def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs):
+    h1Arr=[]
+    h1Arr.append(buildArticle(h1URLs[0], name))
+
+    h2Arr=[]
+    for x in h2URLs:
+        h2Arr.append(buildArticle(x, name))
+
+    h3Arr=[]
+    for x in h3URLs:
+        h3Arr.append(buildArticle(x, name))
+
+    #BUILD THE NEWS SOURCE
+    newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr)
+
+    return newsSource
+
+
+'''
+Some sites will replicate URLs across the page. This function removes them.
+Check hierarchically: if h3 exists in h1s or h2s, remove from h3s;
+if h2 exists in h1s, remove from h2s
+
+also check partial URLs (e.g. nytimes.com/story.html is the same as
+nytimes.com/story.html?var=x
+'''
+def removeDuplicates(h1s, h2s, h3s):
+    #Assume h1s is one element, and keep it
+
+    #remove h2 duplicates
+    removeArr=[]
+    for i in range(len(h2s)):
+        #check internally
+        for j in range(len(h2s)):
+            if i==j:
+                continue
+            else:
+                if h2s[i] in h2s[j]:
+                    removeArr.append(h2s[j])
+        #check against h1s
+        for k in range(len(h1s)):
+            if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]):
+                removeArr.append(h2s[i])
+    for x in removeArr:
+        h2s.remove(x)
+
+    #remove h3 duplicates
+    removeArr=[]
+    for i in range(len(h3s)):
+        #check internally
+        for j in range(len(h3s)):
+            if i==j:
+                continue
+            else:
+                if h3s[i] in h3s[j]:
+                    removeArr.append(h3s[j])
+        #check against h1s and h2s
+        h1and2=h1s+h2s
+        for k in range(len(h1and2)):
+            if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]):
+                removeArr.append(h3s[i])
+    for x in removeArr:
+        h3s.remove(x)
+
+
+    return h1s, h2s, h3s
+
+
+def buildFoxNews():
+    url='http://foxnews.com'
+    name='Fox News'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content=urlToContent(url)
+
+    #get main headline
+    h1=content
+    h1=h1.split('<h1><a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+    h1s=[h1]
+
+    #GET SECONDARY HEADLINES
+    h2=content
+    h2s=[]
+    h2=h2.split('<div class="top-stories">', 1)[1]
+    h2=h2.split('<section id="latest"', 1)[0]
+    while '<li data-vr-contentbox=""><a href="' in h2:
+        h2=h2.split('<li data-vr-contentbox=""><a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if h1 not in x:
+            h2s.append(x)
+
+    #GET TERTIARY HEADLINES
+    h3=content
+    h3s=[]
+    h3=h3.split('div id="big-top"', 1)[1]
+    h3=h3.split('<div class="top-stories">', 1)[0]
+    while '<a href="' in h3:
+        h3=h3.split('<a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if h1 not in x:
+            h3s.append(x)
+
+    h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+    fox=buildNewsSource2(name, url, h1s, h2s, h3s)
+
+    return fox
+
+
+
+def buildNYT():
+    url='http://www.nytimes.com'
+    name='New York Times'
+
+    #DOWNLOAD HOMEPAGE CONTENT
+    content=urlToContent(url)
+
     #get main headline
     #this will likely need if/else logic
     h1=content
@@ -24,7 +152,7 @@ def buildNYT():
     h1=h1.split('<a href="', 1)[1]
     h1=h1.split('"', 1)[0]

-    #GET SECONARY HEADLINES
+    #GET SECONDARY HEADLINES
     #This comes from the a column or b column, above the break
     h2=content
     h2s=[]
@@ -57,19 +185,6 @@ def buildNYT():
         if (h1 not in x) and (x not in h2s):
             h2s.append(x)

-    #REMOVE DUPLICATES
-    removeArr=[]
-    for i in range(len(h2s)):
-        for j in range(len(h2s)):
-            if i==j:
-                continue
-            else:
-                if h2s[i] in h2s[j]:
-                    removeArr.append(h2s[j])
-    for x in removeArr:
-        h2s.remove(x)
-
-
     #GET TERTIARY HEADLINES
     h3=content
     h3s=[]
@@ -86,32 +201,9 @@ def buildNYT():
         x=h3.split('"', 1)[0]
         if (h1 not in x) and (x not in h3s):
             h3s.append(x)

-    #REMOVE DUPLICATES
-    removeArr=[]
-    for i in range(len(h3s)):
-        for j in range(len(h3s)):
-            if i==j:
-                continue
-            else:
-                if h3s[i] in h3s[j]:
-                    removeArr.append(h3s[j])
-    for x in removeArr:
-        h3s.remove(x)
-
-
-    #BUILD THE ARTICLES BASED ON URLS
-    h1Arr=[]
-    h1Arr.append(buildArticle(h1, 'New York Times'))
-
-    h2Arr=[]
-    for x in h2s:
-        h2Arr.append(buildArticle(x, 'New York Times'))
-
-    h3Arr=[]
-    for x in h3s:
-        h3Arr.append(buildArticle(x, 'New York Times'))
-    nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr)
+    h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s)
+    nyt=buildNewsSource2(name, url, h1s, h2s, h3s)

     return nyt

@@ -119,6 +211,7 @@ def buildNYT():

 '''
+NYT EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
 <div class="span-ab-layout layout">
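The refactor above moves the shared work into three parser.py helpers: urlToContent() fetches a homepage into a string, removeDuplicates() drops repeated or partially overlapping URLs across the headline tiers, and buildNewsSource2() turns the collected URLs into Article objects inside a NewsSource2. The sketch below is a minimal illustration (not part of the commit) of how a further outlet could be wired up from those helpers in the same style as buildFoxNews(); the outlet name, URL, and every HTML marker passed to split() are hypothetical placeholders rather than markers from a real site.

```python
#Minimal sketch only, not part of the commit. Assumes it lives in parser.py
#next to buildFoxNews(), so urlToContent, removeDuplicates, buildNewsSource2
#and the unbiasedObjects/unbiasedFunctions imports are already in scope.
def buildExampleSource():
    url='http://example-news.test'   #placeholder outlet, not a real source
    name='Example News'

    #DOWNLOAD HOMEPAGE CONTENT (wget into scratch/, then read it back)
    content=urlToContent(url)

    #get main headline: take the URL between a marker and the closing quote
    h1=content
    h1=h1.split('<h1 class="lead"><a href="', 1)[1]   #placeholder marker
    h1=h1.split('"', 1)[0]

    #GET SECONDARY HEADLINES from one bounded region of the page
    h2s=[]
    h2=content
    h2=h2.split('<div class="top-section">', 1)[1]    #placeholder markers
    h2=h2.split('<div class="footer">', 1)[0]
    while '<a href="' in h2:
        h2=h2.split('<a href="', 1)[1]
        x=h2.split('"', 1)[0]
        if h1 not in x:
            h2s.append(x)

    #drop repeated URLs, then build Article objects for each tier
    h1s, h2s, h3s = removeDuplicates([h1], h2s, [])
    return buildNewsSource2(name, url, h1s, h2s, h3s)
```

main.py would then register such a source the same way this commit registers Fox News: build it and append it to sourceList before buildNewsSourceArr() runs.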