From b544a59cb96193ddcd0b8c0f9cc70bda973415a5 Mon Sep 17 00:00:00 2001 From: ssstvinc2 Date: Sun, 19 Feb 2017 23:04:31 -0500 Subject: Fixed bounding box on h1s --- html_template/BAKtemplate.html | 236 +++--- html_template/BAKunbiased.css | 206 ++--- html_template/Penguins.jpg | Bin html_template/newtemplate.html | 300 ++++---- html_template/template.html | 18 +- html_template/unbiased.css | 7 +- main.py | 0 parser.py | 1610 ++++++++++++++++++++-------------------- 8 files changed, 1194 insertions(+), 1183 deletions(-) mode change 100644 => 100755 html_template/BAKtemplate.html mode change 100644 => 100755 html_template/BAKunbiased.css mode change 100644 => 100755 html_template/Penguins.jpg mode change 100644 => 100755 html_template/template.html mode change 100644 => 100755 html_template/unbiased.css mode change 100644 => 100755 main.py mode change 100644 => 100755 parser.py diff --git a/html_template/BAKtemplate.html b/html_template/BAKtemplate.html old mode 100644 new mode 100755 index ab1dbb9..94a3796 --- a/html_template/BAKtemplate.html +++ b/html_template/BAKtemplate.html @@ -1,118 +1,118 @@ - - - - - - - UnBiased - - - - - -
-
-
- -
- -
-
xxTitle1-1xx
-
-
xxDesc1-1xx
-
- -
- -
- -
-
xxTitle1-2xx
-
-
xxDesc1-2xx
-
- -
- -
- -
-
xxTitle1-3xx
-
-
xxDesc1-3xx
-
-
- - - -
-
- -
- -
- Sources: xxSourcesxx -
- - + + + + + + + UnBiased + + + + + +
+
+
+ +
+ +
+
xxTitle1-1xx
+
+
xxDesc1-1xx
+
+ +
+ +
+ +
+
xxTitle1-2xx
+
+
xxDesc1-2xx
+
+ +
+ +
+ +
+
xxTitle1-3xx
+
+
xxDesc1-3xx
+
+
+ + + +
+
+ +
+ +
+ Sources: xxSourcesxx +
+ + diff --git a/html_template/BAKunbiased.css b/html_template/BAKunbiased.css old mode 100644 new mode 100755 index 49b6dce..ade390b --- a/html_template/BAKunbiased.css +++ b/html_template/BAKunbiased.css @@ -1,104 +1,104 @@ -a:link, a:visited, a:hover, a:active { - color: #00f; - text-decoration:none; - } - -#page-header{ - text-align:center; - padding:.5em 0 1em; - margin-bottom:1em; - border-bottom:1px solid #000; -} - -.title{ - font-size:3em; -} - -#title-1{ - font-style:italic; - color:#d00; -} - -#title-2{ - color:#00d; -} - -#subtitle{ - font-size:1.25em; -} - -#timestamp{ - margin:.5em 0 0 0; - font-size:.8em; -} - -#page-container{ - width:1150px; - padding:0 1em; - margin-left:auto; - margin-right:auto; -} - -#top-stories{ - width:1150px; - margin-left:auto; - margin-right:auto; - font-size:1.25em; -} - -.top-story{ - width:350px; - float:left; - margin:0 .5em; -} - -.top-stories-img{ - width:350px; - height:200px; - overflow:hidden; -} - -.top-stories-img img{ - width:100%; - display:block; - vertical-align:text-bottom; -} - -.top-stories-desc{ - font-size:.8em; - padding-top:.5em; -} - -#middle-stories{ - clear:both; - width:1000px; - margin:0 auto; -} - -.middle-story{ - margin:2em 5px; - width:45%; - float:left; - height:100px; -} - -.middle-story img{ - vertical-align:middle; - height:100px; - float:left; - margin-right:1em; -} - -.middle-stories-hed{ - font-size:1.1em; -} - -.middle-story p{ - display:block; -} - -#sources{ - clear:both; - padding-top:4em; - font-size:.8em; +a:link, a:visited, a:hover, a:active { + color: #00f; + text-decoration:none; + } + +#page-header{ + text-align:center; + padding:.5em 0 1em; + margin-bottom:1em; + border-bottom:1px solid #000; +} + +.title{ + font-size:3em; +} + +#title-1{ + font-style:italic; + color:#d00; +} + +#title-2{ + color:#00d; +} + +#subtitle{ + font-size:1.25em; +} + +#timestamp{ + margin:.5em 0 0 0; + font-size:.8em; +} + +#page-container{ + width:1150px; + padding:0 1em; + margin-left:auto; + margin-right:auto; +} + +#top-stories{ + width:1150px; + margin-left:auto; + margin-right:auto; + font-size:1.25em; +} + +.top-story{ + width:350px; + float:left; + margin:0 .5em; +} + +.top-stories-img{ + width:350px; + height:200px; + overflow:hidden; +} + +.top-stories-img img{ + width:100%; + display:block; + vertical-align:text-bottom; +} + +.top-stories-desc{ + font-size:.8em; + padding-top:.5em; +} + +#middle-stories{ + clear:both; + width:1000px; + margin:0 auto; +} + +.middle-story{ + margin:2em 5px; + width:45%; + float:left; + height:100px; +} + +.middle-story img{ + vertical-align:middle; + height:100px; + float:left; + margin-right:1em; +} + +.middle-stories-hed{ + font-size:1.1em; +} + +.middle-story p{ + display:block; +} + +#sources{ + clear:both; + padding-top:4em; + font-size:.8em; } \ No newline at end of file diff --git a/html_template/Penguins.jpg b/html_template/Penguins.jpg old mode 100644 new mode 100755 diff --git a/html_template/newtemplate.html b/html_template/newtemplate.html index 923dee2..0cec766 100644 --- a/html_template/newtemplate.html +++ b/html_template/newtemplate.html @@ -1,150 +1,150 @@ - - - - - - UnBiased - - - - - -
-
- -
- -
-
-
Rand Paul and Cory Booker push bipartisan effort to limit solitary confinement for juveniles
-
-
Sen. Rand Paul (R-Ky) and Sen …
-
- -
- -
-
-
Bibi and Donald
-
-
This week, Israel's prime minister will visit Washington and meet with our new president. They will have a complex agenda. Benjamin ...
-
- -
- -
-
-
David Oyelowo on How to Play a Real King
-
-
He stars in “A United Kingdom,” about the Botswana leader who married a white woman and set off an international crisis.
-
- -
- -
-
-
Judge orders Ohio village to pay back $3 million to lead-footed drivers
-
-
Speed cameras became a cash cow for the small village of New Miami, Ohio.
-
- -
- - - -
-
- xxTitle3-1xx -
- -
- xxTitle3-2xx -
- -
- xxTitle3-3xx -
- -
- xxTitle3-4xx -
- -
- xxTitle3-5xx -
- -
- xxTitle3-6xx -
- -
- xxTitle3-7xx -
- -
- xxTitle3-8xx -
-
- -
- -
- Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News -
- - + + + + + + UnBiased + + + + + +
+
+ +
+ +
+
+
Rand Paul and Cory Booker push bipartisan effort to limit solitary confinement for juveniles
+
+
Sen. Rand Paul (R-Ky) and Sen …
+
+ +
+ +
+
+
Bibi and Donald
+
+
This week, Israel's prime minister will visit Washington and meet with our new president. They will have a complex agenda. Benjamin ...
+
+ +
+ +
+
+
David Oyelowo on How to Play a Real King
+
+
He stars in “A United Kingdom,” about the Botswana leader who married a white woman and set off an international crisis.
+
+ +
+ +
+
+
Judge orders Ohio village to pay back $3 million to lead-footed drivers
+
+
Speed cameras became a cash cow for the small village of New Miami, Ohio.
+
+ +
+ + + +
+
+ xxTitle3-1xx +
+ +
+ xxTitle3-2xx +
+ +
+ xxTitle3-3xx +
+ +
+ xxTitle3-4xx +
+ +
+ xxTitle3-5xx +
+ +
+ xxTitle3-6xx +
+ +
+ xxTitle3-7xx +
+ +
+ xxTitle3-8xx +
+
+ +
+ +
+ Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News +
+ + diff --git a/html_template/template.html b/html_template/template.html old mode 100644 new mode 100755 index c0e0711..41eb86e --- a/html_template/template.html +++ b/html_template/template.html @@ -16,12 +16,12 @@
+
-
- -
-
-
xxTitle1-1xx
+
+ +
+
xxTitle1-1xx
xxDesc1-1xx
@@ -35,6 +35,10 @@
xxDesc1-2xx
+
+ +
+
- + +
+
diff --git a/html_template/unbiased.css b/html_template/unbiased.css old mode 100644 new mode 100755 index 126e194..c0bb121 --- a/html_template/unbiased.css +++ b/html_template/unbiased.css @@ -71,17 +71,22 @@ a:link, a:visited, a:hover, a:active { margin-bottom: 10px; } +.row{ + display:flex; +} + .top-story{ display:inline-block; vertical-align:top; text-align:left; width:360px; - height:352px; + height:auto; overflow:hidden; background:#fff; margin:10px; padding:10px; border:2px solid #ccc; + flex:1; } @media only screen and (max-width:500px){ diff --git a/main.py b/main.py old mode 100644 new mode 100755 diff --git a/parser.py b/parser.py old mode 100644 new mode 100755 index a537d48..2c22a87 --- a/parser.py +++ b/parser.py @@ -1,805 +1,805 @@ -#!/usr/bin/env python3 - -from unbiasedObjects import * -from unbiasedFunctions import buildArticle -import os -import re - - -''' -Takes in a URL, downloads the file to a temp file, -reads the file into a string, and returns that string -''' -def urlToContent(url): - #download file - os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) - - #read file - f=open('scratch/temp1.html', 'r')#, encoding="utf8") - content=f.read() - f.close() - - return content - - -''' -Creates a new newsSource2 object. For each URL in h1-h3URLs, -calls the file scraper and appends the new Article object. -Returns a newsSource2 object -''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): - h1Arr=[] - h1Arr.append(buildArticle(h1URLs[0], name)) - - h2Arr=[] - for x in h2URLs: - a=buildArticle(x, name) - if a!=None: - h2Arr.append(a) - - h3Arr=[] - for x in h3URLs: - a=buildArticle(x, name) - if a!=None: - h3Arr.append(a) - - #BUILD THE NEWS SOURCE - newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) - - return newsSource - - -''' -Some sites will replicate URLs across the page. This function removes them. -Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; -if h2 exists in h1s, remove from h2s - -also check partial URLs (e.g. nytimes.com/story.html is the same as -nytimes.com/story.html?var=x -''' -def removeDuplicates(h1s, h2s, h3s): - #Assume h1s is one element, and keep it - - #remove h2 duplicates - removeArr=[] - for i in range(len(h2s)): - #check internally - for j in range(len(h2s)): - if i==j: - continue - else: - if h2s[i] in h2s[j]: - removeArr.append(h2s[j]) - #check against h1s - for k in range(len(h1s)): - if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): - removeArr.append(h2s[i]) - for x in removeArr: - h2s.remove(x) - - #remove h3 duplicates - removeArr=[] - for i in range(len(h3s)): - #check internally - for j in range(len(h3s)): - if i==j: - continue - else: - if h3s[i] in h3s[j]: - removeArr.append(h3s[j]) - #check against h1s and h2s - h1and2=h1s+h2s - for k in range(len(h1and2)): - if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): - removeArr.append(h3s[i]) - for x in removeArr: - h3s.remove(x) - - - return h1s, h2s, h3s - - - -def removalNotification(source, title, reason, value): - print('*************************') - print('\t\tSTORY REMOVED') - print('SOURCE: '+source) - print('TITLE: \t'+title) - print('REASON: '+reason) - print('VALUE: \t'+value) - print('*************************\n\n') - - -def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): - - arr=[source.h1Arr, source.h2Arr, source.h3Arr] - - if badTitleArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badTitleArr: - if item in hed.title: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'Title', item) - - - if badDescArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badDescArr: - if item in hed.description: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'Description', item) - - - if badAuthorArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badAuthorArr: - if item in hed.author: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'Author', item) - - - if badImgArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badImgArr: - if item in hed.img: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'Image', item) - - if badURLArr!=None: - for i in range(len(arr)): - for hed in arr[i]: - for item in badURLArr: - if item in hed.url: - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, 'URL', item) - - return source - - - - -def buildTheHill(): - url='http://thehill.com' - name='The Hill' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('
', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
', 1)[1] - h3=h3.split('', 1)[0] - while '
', 3)[2:] - for x in h2: - x=x.split('

', 1)[1] - h3=h3.split('', 1)[0] - while '
  • ' in h2: - h2=h2.split('
  • ', 1)[1] - h2=h2.split('', 1)[1] - h2=h2.split('
    ', 1)[1] - h3=h3.split('Watch/Listen', 1)[0] - while '
    ', 1)[1] - h1=h1.split('href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('
    ', 1)[1] - h2=h2.split('
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split('href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('Today\'s Standard', 1)[1] - h3=h3.split('
    ' in h3: - h3=h3.split('
    ', 1)[1] - h3=h3.split('href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(x) - - #Need to add URL prefix to all URLs - for i in range(len(h1s)): - h1s[i]=url+h1s[i] - for i in range(len(h2s)): - h2s[i]=url+h2s[i] - for i in range(len(h3s)): - h3s[i]=url+h3s[i] - - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wkl=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=None - ## if flagged again, remove Micah Mattix - badDescArr=['Matt Labash'] - badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] - badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] - wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) - - return wkl - - - - -def buildNPR(): - url='http://www.npr.org/sections/news/' - name='NPR' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split('', 1)[1] - h2=h2.split('
    ', 1)[0] - while '', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h2.split('
    ', 1) - h2=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '', 1)[1] - h2=h2.split('', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h2.split('
    ', 1) - h2=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '', 1)[1] - h3=h3.split('', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h3.split('
    ', 1) - h3=arr[0]+arr[1].split('', 1)[1] - - #Grab the remaining URLs - while ' - -
    - -
    -

    Top News

    - -
    - - -''' +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import buildArticle +import os +import re + + +''' +Takes in a URL, downloads the file to a temp file, +reads the file into a string, and returns that string +''' +def urlToContent(url): + #download file + os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + + #read file + f=open('scratch/temp1.html', 'r')#, encoding="utf8") + content=f.read() + f.close() + + return content + + +''' +Creates a new newsSource2 object. For each URL in h1-h3URLs, +calls the file scraper and appends the new Article object. +Returns a newsSource2 object +''' +def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): + h1Arr=[] + h1Arr.append(buildArticle(h1URLs[0], name)) + + h2Arr=[] + for x in h2URLs: + a=buildArticle(x, name) + if a!=None: + h2Arr.append(a) + + h3Arr=[] + for x in h3URLs: + a=buildArticle(x, name) + if a!=None: + h3Arr.append(a) + + #BUILD THE NEWS SOURCE + newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) + + return newsSource + + +''' +Some sites will replicate URLs across the page. This function removes them. +Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; +if h2 exists in h1s, remove from h2s + +also check partial URLs (e.g. nytimes.com/story.html is the same as +nytimes.com/story.html?var=x +''' +def removeDuplicates(h1s, h2s, h3s): + #Assume h1s is one element, and keep it + + #remove h2 duplicates + removeArr=[] + for i in range(len(h2s)): + #check internally + for j in range(len(h2s)): + if i==j: + continue + else: + if h2s[i] in h2s[j]: + removeArr.append(h2s[j]) + #check against h1s + for k in range(len(h1s)): + if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): + removeArr.append(h2s[i]) + for x in removeArr: + h2s.remove(x) + + #remove h3 duplicates + removeArr=[] + for i in range(len(h3s)): + #check internally + for j in range(len(h3s)): + if i==j: + continue + else: + if h3s[i] in h3s[j]: + removeArr.append(h3s[j]) + #check against h1s and h2s + h1and2=h1s+h2s + for k in range(len(h1and2)): + if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): + removeArr.append(h3s[i]) + for x in removeArr: + h3s.remove(x) + + + return h1s, h2s, h3s + + + +def removalNotification(source, title, reason, value): + print('*************************') + print('\t\tSTORY REMOVED') + print('SOURCE: '+source) + print('TITLE: \t'+title) + print('REASON: '+reason) + print('VALUE: \t'+value) + print('*************************\n\n') + + +def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): + + arr=[source.h1Arr, source.h2Arr, source.h3Arr] + + if badTitleArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badTitleArr: + if item in hed.title: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Title', item) + + + if badDescArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badDescArr: + if item in hed.description: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Description', item) + + + if badAuthorArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badAuthorArr: + if item in hed.author: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Author', item) + + + if badImgArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badImgArr: + if item in hed.img: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'Image', item) + + if badURLArr!=None: + for i in range(len(arr)): + for hed in arr[i]: + for item in badURLArr: + if item in hed.url: + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, 'URL', item) + + return source + + + + +def buildTheHill(): + url='http://thehill.com' + name='The Hill' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('
    ', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
    ', 1)[1] + h3=h3.split('', 1)[0] + while '
    ', 3)[2:] + for x in h2: + x=x.split('

    ', 1)[1] + h3=h3.split('', 1)[0] + while '
  • ' in h2: + h2=h2.split('
  • ', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
    ', 1)[1] + h3=h3.split('Watch/Listen', 1)[0] + while '
    ', 1)[1] + h1=h1.split('href="', 1)[1] + h1=h1.split('"', 1)[0] + h1s=[h1] + + #GET SECONDARY HEADLINES + h2=content + h2s=[] + h2=h2.split('
    ', 1)[1] + h2=h2.split('
    ' in h2: + h2=h2.split('
    ', 1)[1] + h2=h2.split('href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(x) + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('Today\'s Standard', 1)[1] + h3=h3.split('
    ' in h3: + h3=h3.split('
    ', 1)[1] + h3=h3.split('href="', 1)[1] + x=h3.split('"', 1)[0] + if h1 not in x: + h3s.append(x) + + #Need to add URL prefix to all URLs + for i in range(len(h1s)): + h1s[i]=url+h1s[i] + for i in range(len(h2s)): + h2s[i]=url+h2s[i] + for i in range(len(h3s)): + h3s[i]=url+h3s[i] + + + h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) + wkl=buildNewsSource2(name, url, h1s, h2s, h3s) + + #REMOVE BAD STORIES + badTitleArr=None + ## if flagged again, remove Micah Mattix + badDescArr=['Matt Labash'] + badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] + badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] + wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) + + return wkl + + + + +def buildNPR(): + url='http://www.npr.org/sections/news/' + name='NPR' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
    ' in h2: + h2=h2.split('
    ', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
    ', 1)[0] + while '', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + #remove "collection" sets + while '