From e690fdfa6f1eebac5a4790668ab946e82f947eaf Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Sun, 16 Apr 2017 16:59:02 -0400
Subject: take webroot as a command line argument

---
 main.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index a109d2f..f1c3317 100755
--- a/main.py
+++ b/main.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+import argparse
+import os
+
 from unbiasedObjects import *
 from unbiasedFunctions import *
 from parser import *
@@ -7,13 +10,17 @@ import time
 
 
 def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html')
+    args = parser.parse_args()
+
     while True:
         print('-----------------------')
-        run()
+        run(args.webroot)
         print('-----------------------')
         time.sleep(600)
 
-def run():
+def run(webroot):
     sourceList=[]
 
     '''
@@ -25,6 +32,8 @@ def run():
 
     '''
 
+    print('running with webroot="{}"'.format(webroot))
+
 
     ### These values have to be the second half of the function name
     ### E.g. Guardian calls buildGuardian(), etc.
@@ -54,7 +63,7 @@ def run():
     outputHTML=buildOutput(newsSourceArr)
 
     #print the output file HTML
-    printOutputHTML(outputHTML, '/var/www/html/index.html')
+    printOutputHTML(outputHTML, os.path.join(webroot, 'index.html'))
 
 
 if __name__=="__main__":
-- cgit v1.2.3

From 4622a264b8d6e0446a52d96b7df220d357c082a9 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Sun, 16 Apr 2017 23:21:22 -0400
Subject: move files around for packaging reasons

---
 .gitignore                              |   4 +-
 html_template/newtemplate.html          | 150 -----
 html_template/template.html             | 173 ------
 html_template/unbiased.css              | 220 -------
 main.py                                 |  70 ---
 parser.py                               | 986 --------------------------------
 scratch/do_not_delete                   |   0
 spotCheck.py                            |  41 --
 unbiased/html_template/newtemplate.html | 150 +++++
 unbiased/html_template/template.html    | 173 ++++++
 unbiased/html_template/unbiased.css     | 220 +++++++
 unbiased/main.py                        |  70 +++
 unbiased/parser.py                      | 986 ++++++++++++++++++++++++++++++++
 unbiased/scratch/do_not_delete          |   0
 unbiased/spotCheck.py                   |  41 ++
 unbiased/unbiasedFunctions.py           | 259 +++++++++
 unbiased/unbiasedObjects.py             |  90 +++
 unbiasedFunctions.py                    | 259 ---------
 unbiasedObjects.py                      |  90 ---
 19 files changed, 1991 insertions(+), 1991 deletions(-)
 delete mode 100644 html_template/newtemplate.html
 delete mode 100755 html_template/template.html
 delete mode 100755 html_template/unbiased.css
 delete mode 100755 main.py
 delete mode 100755 parser.py
 delete mode 100644 scratch/do_not_delete
 delete mode 100755 spotCheck.py
 create mode 100644 unbiased/html_template/newtemplate.html
 create mode 100755 unbiased/html_template/template.html
 create mode 100755 unbiased/html_template/unbiased.css
 create mode 100755 unbiased/main.py
 create mode 100755 unbiased/parser.py
 create mode 100644 unbiased/scratch/do_not_delete
 create mode 100755 unbiased/spotCheck.py
 create mode 100644 unbiased/unbiasedFunctions.py
 create mode 100644 unbiased/unbiasedObjects.py
 delete mode 100644 unbiasedFunctions.py
 delete mode 100644 unbiasedObjects.py

diff --git a/.gitignore b/.gitignore
index 65c8f8e..90bf98d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,10 @@
 *.pyc
 *~
 __pycache__/
-scratch/*.html
+unbiased/scratch/*.html
 legacy_py/
 unbiased.html
 html_template/Penguins.jpg
 html_template/BAK*
 #*
-.#*
\ No newline at end of file
+.#*

diff --git a/html_template/newtemplate.html b/html_template/newtemplate.html
deleted file mode 100644
index 0cec766..0000000
--- a/html_template/newtemplate.html
+++ /dev/null
@@ -1,150 +0,0 @@
- - - - - - UnBiased - - - - - - - -
- Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News -
- -
diff --git a/html_template/template.html b/html_template/template.html
deleted file mode 100755
index fc17006..0000000
--- a/html_template/template.html
+++ /dev/null
@@ -1,173 +0,0 @@
- - - - - - - UnBiased - - - - - - - -
- Sources: xxSourcesxx -
- - diff --git a/html_template/unbiased.css b/html_template/unbiased.css deleted file mode 100755 index 244f100..0000000 --- a/html_template/unbiased.css +++ /dev/null @@ -1,220 +0,0 @@ -/*body{ - width:900px; - margin-left:auto; - margin-right:auto; -}*/ - - -body{ - margin:0; -} - -a:link, a:visited, a:hover, a:active { - color: #00f; - text-decoration:none; - } - -a:hover{ - cursor:pointer; -} - -#page-header{ - width:100%; - text-align:center; - padding:.5em 0 1em; - margin-bottom:1em; - border-bottom:3px solid #BB133E; - background:#002147; -} - -.title{ - font-size:3em; -} - -#title-1{ - font-style:italic; - color:#fff; -} - -#title-2{ - color:#fff; -} - -#subtitle{ - font-size:1.25em; - color:#ccc; -} - -#timestamp{ - margin:.5em 0 0 0; - font-size:.8em; - color:#cc6; -} - -#page-container{ - width:900px; - margin-left:auto; - margin-right:auto; -} - -@media only screen and (max-width:900px){ - #page-container{ - width:100% - } -} - -#top-stories{ - width:95%; - display:block; - overflow:auto; - padding:10px; - margin-left:auto; - margin-right:auto; - text-align:center; - border-bottom: 3px solid #BB133E; - margin-bottom: 10px; -} - -.row{ - display:flex; -} - -.top-story{ - display:inline-block; - vertical-align:top; - text-align:left; - width:360px; - height:auto; - overflow:hidden; - background:#fff; - margin:10px; - padding:10px; - border:2px solid #ccc; - flex:1; -} - -@media only screen and (max-width:500px){ - .row{ - display:block; - } - .top-story{ - display:block; - width:auto; - height:auto; - } -} - -.top-stories-img{ - width:350px; - height:200px; - overflow:hidden; - background-size: auto 234px;/*cover;*/ - background-position: top center;/*center center;*/ - margin:0 auto; -} - -@media only screen and (max-width:500px){ - .top-stories-img{ - width:auto; - } -} - - -.top-stories-hed{ - font-weight:bold; - font-size:1.35em; - margin:10px 10px 0; - color:#00f; -} - -.top-stories-desc{ - font-size:1em; - padding-top:.5em; - margin:0 .75em; -} - -#middle-stories{ - clear:both; - width:500px; - margin:0 auto; - padding:0; - display:block; - overflow:auto; - float:left; -} - -@media only screen and (max-width:500px){ - #middle-stories{ - width:100%; - float:none; - } -} - -.middle-story{ - margin:5px 10px; - padding:10px; - background:#fff; - border:2px solid #ddd; - width:460px; - float:left; -} - -@media only screen and (max-width:500px){ - .middle-story{ - width:auto; - } -} - -.middle-stories-img{ - width:150px; - height:100px; - overflow:hidden; - background-size: auto 117px;/*cover;*/ - background-position: top center;/*center center;*/ - float:left; - max-width:35%; -} - -.middle-stories-hed{ - font-size:1.2em; - float:left; - width:300px; - margin-left:10px; - color:#00f; -} - -@media only screen and (max-width:500px){ - .middle-stories-hed{ - max-width:60%; - } -} - -#bottom-stories{ - margin:0 10px; - padding:10px; - display:block; - overflow:auto; - float:left; - width:350px; - border:5px solid #ddd; -} - -@media only screen and (max-width:900px){ - #bottom-stories{ - width:auto; - border-width:3px; - float:none; - } -} - -.bottom-story{ color:#00f; - - padding:15px 0; - color:#00f; -} - -#sources{ - clear:both; - padding-top:4em; - font-size:.8em; -} \ No newline at end of file diff --git a/main.py b/main.py deleted file mode 100755 index f1c3317..0000000 --- a/main.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os - -from unbiasedObjects import * -from unbiasedFunctions import * -from parser import * -import time - - -def 
main(): - parser = argparse.ArgumentParser() - parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html') - args = parser.parse_args() - - while True: - print('-----------------------') - run(args.webroot) - print('-----------------------') - time.sleep(600) - -def run(webroot): - sourceList=[] - - ''' - - SOURCES TO ADD NEXT: - -ABC - -REUTERS - -Town Hall - - ''' - - print('running with webroot="{}"'.format(webroot)) - - - ### These values have to be the second half of the function name - ### E.g. Guardian calls buildGuardian(), etc. - sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', - 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' - - for source in sourceFnArr: - tries=0 - while tries<3: - try: - fn='build'+source - possibles = globals().copy() - possibles.update(locals()) - method = possibles.get(fn) - src=method() - sourceList.append(src) - break - except: - print('Build error. Looping again: '+source) - tries+=1 - time.sleep(tries) - - #scrape all urls and build data structure - newsSourceArr=buildNewsSourceArr(sourceList) - - #build the output file HTML - outputHTML=buildOutput(newsSourceArr) - - #print the output file HTML - printOutputHTML(outputHTML, os.path.join(webroot, 'index.html')) - - -if __name__=="__main__": - main() diff --git a/parser.py b/parser.py deleted file mode 100755 index f69281b..0000000 --- a/parser.py +++ /dev/null @@ -1,986 +0,0 @@ -#!/usr/bin/env python3 - -from unbiasedObjects import * -from unbiasedFunctions import buildArticle -import os -import re - - -''' -Takes in a URL, downloads the file to a temp file, -reads the file into a string, and returns that string -''' -def urlToContent(url, sourceEncoding='utf8'): - #download file - os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) - - #read file - if sourceEncoding=='utf8': - f=open('scratch/temp1.html', 'r', encoding="utf8") - else: - f=open('scratch/temp1.html', 'r', encoding="latin-1") - content=f.read() - f.close() - - return content - - -''' -Creates a new newsSource2 object. For each URL in h1-h3URLs, -calls the file scraper and appends the new Article object. -Returns a newsSource2 object -''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): - h1Arr=[] - a=buildArticle(h1URLs[0], name) - if a==None: - print('................\nH1 Nonetype in '+name+'\n................') - else: - h1Arr.append(a) - - h2Arr=[] - for x in h2URLs: - a=buildArticle(x, name) - if a!=None: - h2Arr.append(a) - else: - print('................\nH2 Nonetype in '+name+'\n................') - - - h3Arr=[] - for x in h3URLs: - a=buildArticle(x, name) - if a!=None: - h3Arr.append(a) - else: - print('................\nH3 Nonetype in '+name+'\n................') - - #BUILD THE NEWS SOURCE - newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) - - return newsSource - - -''' -Some sites will replicate URLs across the page. This function removes them. -Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; -if h2 exists in h1s, remove from h2s - -also check partial URLs (e.g. 
nytimes.com/story.html is the same as -nytimes.com/story.html?var=x -''' -def removeDuplicates(h1s, h2s, h3s): - #Assume h1s is one element, and keep it - - #remove h2 duplicates - removeArr=[] - for i in range(len(h2s)): - #check internally - for j in range(len(h2s)): - if i==j: - continue - else: - if h2s[i] in h2s[j]: - removeArr.append(h2s[j]) - #check against h1s - for k in range(len(h1s)): - if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): - removeArr.append(h2s[i]) - for x in removeArr: - h2s.remove(x) - - #remove h3 duplicates - removeArr=[] - for i in range(len(h3s)): - #check internally - for j in range(len(h3s)): - if i==j: - continue - else: - if h3s[i] in h3s[j]: - removeArr.append(h3s[j]) - #check against h1s and h2s - h1and2=h1s+h2s - for k in range(len(h1and2)): - if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): - removeArr.append(h3s[i]) - for x in removeArr: - if x in h3s: - h3s.remove(x) - - - return h1s, h2s, h3s - - - -def removalNotification(source, title, reason, value): - print('*************************') - print('\t\tSTORY REMOVED') - print('SOURCE: '+source) - print('TITLE: \t'+title) - print('REASON: '+reason) - print('VALUE: \t'+value) - print('*************************\n\n') - - -def removeBadStoriesHelper(source, element, badStringList, arr): - if badStringList!=None: - for i in range(len(arr)): - for hed in arr[i]: - if hed==None: - print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////") - break - for item in badStringList: - if item in getattr(hed, element): - arr[i].remove(hed) - #if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if i==0: - arr[0].append(arr[1][0]) - arr[1].remove(arr[1][0]) - removalNotification(source.name, hed.title, element, item) - - -def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): - - arr=[source.h1Arr, source.h2Arr, source.h3Arr] - - removeBadStoriesHelper(source, "title", badTitleArr, arr) - removeBadStoriesHelper(source, "description", badDescArr, arr) - removeBadStoriesHelper(source, "author", badAuthorArr, arr) - removeBadStoriesHelper(source, "img", badImgArr, arr) - removeBadStoriesHelper(source, "url", badURLArr, arr) - - return source - - - - -def buildTheHill(): - url='http://thehill.com' - name='The Hill' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('
', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
', 1)[1] - h3=h3.split('', 1)[0] - while '
')[2:] - for x in h2: - if '

', 1)[1] - h3=h3.split('
', 1)[0]#'', 1)[0] - while '
  • ' in h2: - h2=h2.split('
  • ', 1)[1] - h2=h2.split('', 1)[1] - h2=h2.split('
    ', 1)[1] - h3=h3.split('Watch/Listen', 1)[0] - while '
    ', 1)[1] - h1=h1.split('href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('
    ', 1)[1] - h2=h2.split('
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split('href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('Today\'s Standard', 1)[1] - h3=h3.split('
    ' in h3: - h3=h3.split('
    ', 1)[1] - h3=h3.split('href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(x) - - #Need to add URL prefix to all URLs - for i in range(len(h1s)): - h1s[i]=url+h1s[i] - for i in range(len(h2s)): - h2s[i]=url+h2s[i] - for i in range(len(h3s)): - h3s[i]=url+h3s[i] - - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wkl=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=None - ## if flagged again, remove Micah Mattix - badDescArr=['Matt Labash'] - badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] - badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] - wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) - - return wkl - - - - -def buildNPR(): - url='http://www.npr.org/sections/news/' - name='NPR' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split('', 1)[1] - h2=h2.split('
    ', 1)[0] - while '', 1)[1] - h1=h1.split('
    ', 1)[1] - h2=h2.split('', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h2.split('
    ', 1) - h2=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '', 1)[1] - h3=h3.split('', 1)[0] - #remove "collection" sets - while '
    ' in h3: - arr=h3.split('
    ', 1) - h3=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '' in h3: - h3=h3.split('', 1)[1] - h3=h3.split('', 1)[0] - elif '/video/the-daily-360' in h3: - h3=h3.split('/video/the-daily-360')[-1] - h3=h3.split('More News', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h3.split('
    ', 1) - h3=arr[0]+arr[1].split('', 1)[1] - - #Grab the remaining URLs - while ' - -
    - -
    -

    Top News

    - -
    - - -''' diff --git a/scratch/do_not_delete b/scratch/do_not_delete deleted file mode 100644 index e69de29..0000000 diff --git a/spotCheck.py b/spotCheck.py deleted file mode 100755 index d1edda4..0000000 --- a/spotCheck.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - - -from parser import * -from unbiasedObjects import * -import sys - -def spotCheck(src): - - fns = {'hil' : buildTheHill, - 'cbs' : buildCBS, - 'npr' : buildNPR, - 'fox' : buildFoxNews, - 'gdn' : buildGuardian, - 'blz' : buildBlaze, - 'bbc' : buildBBC, - 'nbc' : buildNBC, - 'wat' : buildWashTimes, - 'csm' : buildCSM, - 'abc' : buildABC} - - data=fns[src]() - - print('H1s:\n--------------') - for h in data.h1Arr: - print(h.title) - - print('\n\nH2s:\n--------------') - for h in data.h2Arr: - print(h.title) - - print('\n\nH3s:\n--------------') - for h in data.h3Arr: - print(h.title) - - print('\n\n') - - - -if __name__=='__main__': - spotCheck(sys.argv[1]) diff --git a/unbiased/html_template/newtemplate.html b/unbiased/html_template/newtemplate.html new file mode 100644 index 0000000..0cec766 --- /dev/null +++ b/unbiased/html_template/newtemplate.html @@ -0,0 +1,150 @@ + + + + + + UnBiased + + + + + + + +
    + Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News +
+ +
diff --git a/unbiased/html_template/template.html b/unbiased/html_template/template.html
new file mode 100755
index 0000000..fc17006
--- /dev/null
+++ b/unbiased/html_template/template.html
@@ -0,0 +1,173 @@
+ + + + + + + UnBiased + + + + + + + +
    + Sources: xxSourcesxx +
    + + diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css new file mode 100755 index 0000000..244f100 --- /dev/null +++ b/unbiased/html_template/unbiased.css @@ -0,0 +1,220 @@ +/*body{ + width:900px; + margin-left:auto; + margin-right:auto; +}*/ + + +body{ + margin:0; +} + +a:link, a:visited, a:hover, a:active { + color: #00f; + text-decoration:none; + } + +a:hover{ + cursor:pointer; +} + +#page-header{ + width:100%; + text-align:center; + padding:.5em 0 1em; + margin-bottom:1em; + border-bottom:3px solid #BB133E; + background:#002147; +} + +.title{ + font-size:3em; +} + +#title-1{ + font-style:italic; + color:#fff; +} + +#title-2{ + color:#fff; +} + +#subtitle{ + font-size:1.25em; + color:#ccc; +} + +#timestamp{ + margin:.5em 0 0 0; + font-size:.8em; + color:#cc6; +} + +#page-container{ + width:900px; + margin-left:auto; + margin-right:auto; +} + +@media only screen and (max-width:900px){ + #page-container{ + width:100% + } +} + +#top-stories{ + width:95%; + display:block; + overflow:auto; + padding:10px; + margin-left:auto; + margin-right:auto; + text-align:center; + border-bottom: 3px solid #BB133E; + margin-bottom: 10px; +} + +.row{ + display:flex; +} + +.top-story{ + display:inline-block; + vertical-align:top; + text-align:left; + width:360px; + height:auto; + overflow:hidden; + background:#fff; + margin:10px; + padding:10px; + border:2px solid #ccc; + flex:1; +} + +@media only screen and (max-width:500px){ + .row{ + display:block; + } + .top-story{ + display:block; + width:auto; + height:auto; + } +} + +.top-stories-img{ + width:350px; + height:200px; + overflow:hidden; + background-size: auto 234px;/*cover;*/ + background-position: top center;/*center center;*/ + margin:0 auto; +} + +@media only screen and (max-width:500px){ + .top-stories-img{ + width:auto; + } +} + + +.top-stories-hed{ + font-weight:bold; + font-size:1.35em; + margin:10px 10px 0; + color:#00f; +} + +.top-stories-desc{ + font-size:1em; + padding-top:.5em; + margin:0 .75em; +} + +#middle-stories{ + clear:both; + width:500px; + margin:0 auto; + padding:0; + display:block; + overflow:auto; + float:left; +} + +@media only screen and (max-width:500px){ + #middle-stories{ + width:100%; + float:none; + } +} + +.middle-story{ + margin:5px 10px; + padding:10px; + background:#fff; + border:2px solid #ddd; + width:460px; + float:left; +} + +@media only screen and (max-width:500px){ + .middle-story{ + width:auto; + } +} + +.middle-stories-img{ + width:150px; + height:100px; + overflow:hidden; + background-size: auto 117px;/*cover;*/ + background-position: top center;/*center center;*/ + float:left; + max-width:35%; +} + +.middle-stories-hed{ + font-size:1.2em; + float:left; + width:300px; + margin-left:10px; + color:#00f; +} + +@media only screen and (max-width:500px){ + .middle-stories-hed{ + max-width:60%; + } +} + +#bottom-stories{ + margin:0 10px; + padding:10px; + display:block; + overflow:auto; + float:left; + width:350px; + border:5px solid #ddd; +} + +@media only screen and (max-width:900px){ + #bottom-stories{ + width:auto; + border-width:3px; + float:none; + } +} + +.bottom-story{ color:#00f; + + padding:15px 0; + color:#00f; +} + +#sources{ + clear:both; + padding-top:4em; + font-size:.8em; +} \ No newline at end of file diff --git a/unbiased/main.py b/unbiased/main.py new file mode 100755 index 0000000..f1c3317 --- /dev/null +++ b/unbiased/main.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +import argparse +import os + +from unbiasedObjects import * +from unbiasedFunctions 
import * +from parser import * +import time + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html') + args = parser.parse_args() + + while True: + print('-----------------------') + run(args.webroot) + print('-----------------------') + time.sleep(600) + +def run(webroot): + sourceList=[] + + ''' + + SOURCES TO ADD NEXT: + -ABC + -REUTERS + -Town Hall + + ''' + + print('running with webroot="{}"'.format(webroot)) + + + ### These values have to be the second half of the function name + ### E.g. Guardian calls buildGuardian(), etc. + sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', + 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' + + for source in sourceFnArr: + tries=0 + while tries<3: + try: + fn='build'+source + possibles = globals().copy() + possibles.update(locals()) + method = possibles.get(fn) + src=method() + sourceList.append(src) + break + except: + print('Build error. Looping again: '+source) + tries+=1 + time.sleep(tries) + + #scrape all urls and build data structure + newsSourceArr=buildNewsSourceArr(sourceList) + + #build the output file HTML + outputHTML=buildOutput(newsSourceArr) + + #print the output file HTML + printOutputHTML(outputHTML, os.path.join(webroot, 'index.html')) + + +if __name__=="__main__": + main() diff --git a/unbiased/parser.py b/unbiased/parser.py new file mode 100755 index 0000000..f69281b --- /dev/null +++ b/unbiased/parser.py @@ -0,0 +1,986 @@ +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import buildArticle +import os +import re + + +''' +Takes in a URL, downloads the file to a temp file, +reads the file into a string, and returns that string +''' +def urlToContent(url, sourceEncoding='utf8'): + #download file + os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + + #read file + if sourceEncoding=='utf8': + f=open('scratch/temp1.html', 'r', encoding="utf8") + else: + f=open('scratch/temp1.html', 'r', encoding="latin-1") + content=f.read() + f.close() + + return content + + +''' +Creates a new newsSource2 object. For each URL in h1-h3URLs, +calls the file scraper and appends the new Article object. +Returns a newsSource2 object +''' +def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): + h1Arr=[] + a=buildArticle(h1URLs[0], name) + if a==None: + print('................\nH1 Nonetype in '+name+'\n................') + else: + h1Arr.append(a) + + h2Arr=[] + for x in h2URLs: + a=buildArticle(x, name) + if a!=None: + h2Arr.append(a) + else: + print('................\nH2 Nonetype in '+name+'\n................') + + + h3Arr=[] + for x in h3URLs: + a=buildArticle(x, name) + if a!=None: + h3Arr.append(a) + else: + print('................\nH3 Nonetype in '+name+'\n................') + + #BUILD THE NEWS SOURCE + newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) + + return newsSource + + +''' +Some sites will replicate URLs across the page. This function removes them. +Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; +if h2 exists in h1s, remove from h2s + +also check partial URLs (e.g. 
nytimes.com/story.html is the same as +nytimes.com/story.html?var=x +''' +def removeDuplicates(h1s, h2s, h3s): + #Assume h1s is one element, and keep it + + #remove h2 duplicates + removeArr=[] + for i in range(len(h2s)): + #check internally + for j in range(len(h2s)): + if i==j: + continue + else: + if h2s[i] in h2s[j]: + removeArr.append(h2s[j]) + #check against h1s + for k in range(len(h1s)): + if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): + removeArr.append(h2s[i]) + for x in removeArr: + h2s.remove(x) + + #remove h3 duplicates + removeArr=[] + for i in range(len(h3s)): + #check internally + for j in range(len(h3s)): + if i==j: + continue + else: + if h3s[i] in h3s[j]: + removeArr.append(h3s[j]) + #check against h1s and h2s + h1and2=h1s+h2s + for k in range(len(h1and2)): + if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): + removeArr.append(h3s[i]) + for x in removeArr: + if x in h3s: + h3s.remove(x) + + + return h1s, h2s, h3s + + + +def removalNotification(source, title, reason, value): + print('*************************') + print('\t\tSTORY REMOVED') + print('SOURCE: '+source) + print('TITLE: \t'+title) + print('REASON: '+reason) + print('VALUE: \t'+value) + print('*************************\n\n') + + +def removeBadStoriesHelper(source, element, badStringList, arr): + if badStringList!=None: + for i in range(len(arr)): + for hed in arr[i]: + if hed==None: + print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////") + break + for item in badStringList: + if item in getattr(hed, element): + arr[i].remove(hed) + #if it's in the h1 slot, bump up the + # first h2 into the h1 slot + if i==0: + arr[0].append(arr[1][0]) + arr[1].remove(arr[1][0]) + removalNotification(source.name, hed.title, element, item) + + +def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): + + arr=[source.h1Arr, source.h2Arr, source.h3Arr] + + removeBadStoriesHelper(source, "title", badTitleArr, arr) + removeBadStoriesHelper(source, "description", badDescArr, arr) + removeBadStoriesHelper(source, "author", badAuthorArr, arr) + removeBadStoriesHelper(source, "img", badImgArr, arr) + removeBadStoriesHelper(source, "url", badURLArr, arr) + + return source + + + + +def buildTheHill(): + url='http://thehill.com' + name='The Hill' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('
    ', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
    ', 1)[1] + h3=h3.split('', 1)[0] + while '
    ')[2:] + for x in h2: + if '

    ', 1)[1] + h3=h3.split('
    ', 1)[0]#'', 1)[0] + while '
  • ' in h2: + h2=h2.split('
  • ', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
    ', 1)[1] + h3=h3.split('Watch/Listen', 1)[0] + while '
    ', 1)[1] + h1=h1.split('href="', 1)[1] + h1=h1.split('"', 1)[0] + h1s=[h1] + + #GET SECONDARY HEADLINES + h2=content + h2s=[] + h2=h2.split('
    ', 1)[1] + h2=h2.split('
    ' in h2: + h2=h2.split('
    ', 1)[1] + h2=h2.split('href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(x) + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('Today\'s Standard', 1)[1] + h3=h3.split('
    ' in h3: + h3=h3.split('
    ', 1)[1] + h3=h3.split('href="', 1)[1] + x=h3.split('"', 1)[0] + if h1 not in x: + h3s.append(x) + + #Need to add URL prefix to all URLs + for i in range(len(h1s)): + h1s[i]=url+h1s[i] + for i in range(len(h2s)): + h2s[i]=url+h2s[i] + for i in range(len(h3s)): + h3s[i]=url+h3s[i] + + + h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) + wkl=buildNewsSource2(name, url, h1s, h2s, h3s) + + #REMOVE BAD STORIES + badTitleArr=None + ## if flagged again, remove Micah Mattix + badDescArr=['Matt Labash'] + badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] + badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] + wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) + + return wkl + + + + +def buildNPR(): + url='http://www.npr.org/sections/news/' + name='NPR' + + #DOWNLOAD HOMEPAGE CONTENT + content=urlToContent(url) + + #get main headline + h1=content + h1=h1.split('', 1)[1] + h1=h1.split('', 1)[1] + h2=h2.split('', 1)[0] + while '
    ' in h2: + h2=h2.split('
    ', 1)[1] + h2=h2.split('', 1)[1] + h2=h2.split('
    ', 1)[0] + while '', 1)[1] + h1=h1.split('
    ', 1)[1] + h2=h2.split('', 1)[0] + #remove "collection" sets + while '
    ' in h2: + arr=h2.split('
    ', 1) + h2=arr[0]+arr[1].split('', 1)[1] + #Grab the remaining URLs + while '', 1)[1] + h3=h3.split('', 1)[0] + #remove "collection" sets + while '
    ' in h3: + arr=h3.split('
    ', 1) + h3=arr[0]+arr[1].split('', 1)[1] + #Grab the remaining URLs + while '' in h3: + h3=h3.split('', 1)[1] + h3=h3.split('', 1)[0] + elif '/video/the-daily-360' in h3: + h3=h3.split('/video/the-daily-360')[-1] + h3=h3.split('More News', 1)[0] + #remove "collection" sets + while '
    ' in h2: + arr=h3.split('
    ', 1) + h3=arr[0]+arr[1].split('', 1)[1] + + #Grab the remaining URLs + while ' + +
    + +
    +

    Top News

    + +
    + + +''' diff --git a/unbiased/scratch/do_not_delete b/unbiased/scratch/do_not_delete new file mode 100644 index 0000000..e69de29 diff --git a/unbiased/spotCheck.py b/unbiased/spotCheck.py new file mode 100755 index 0000000..d1edda4 --- /dev/null +++ b/unbiased/spotCheck.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + + +from parser import * +from unbiasedObjects import * +import sys + +def spotCheck(src): + + fns = {'hil' : buildTheHill, + 'cbs' : buildCBS, + 'npr' : buildNPR, + 'fox' : buildFoxNews, + 'gdn' : buildGuardian, + 'blz' : buildBlaze, + 'bbc' : buildBBC, + 'nbc' : buildNBC, + 'wat' : buildWashTimes, + 'csm' : buildCSM, + 'abc' : buildABC} + + data=fns[src]() + + print('H1s:\n--------------') + for h in data.h1Arr: + print(h.title) + + print('\n\nH2s:\n--------------') + for h in data.h2Arr: + print(h.title) + + print('\n\nH3s:\n--------------') + for h in data.h3Arr: + print(h.title) + + print('\n\n') + + + +if __name__=='__main__': + spotCheck(sys.argv[1]) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py new file mode 100644 index 0000000..1a80d7a --- /dev/null +++ b/unbiased/unbiasedFunctions.py @@ -0,0 +1,259 @@ +from unbiasedObjects import * +import os +import random +import time +import re + + +#take in a url and delimiters, return twitter card +def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): + + debugging=False + if debugging: + print(sourceName) + print(url) + print() + + #download url + os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) + + #read the file in + f=open('scratch/temp_article.html', 'r', encoding="utf8") + content=f.read() + f.close() + + try: + if sourceName=='The Guardian': + #The Guardian puts an identifying banner on their og:images + #grab the main image from the page instead + + #scenario 1: regular image + if '')[0] + elif sourceName=='ABC News': + img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' + if img[-1]=='/': + #because the quote separator could be ' or ", + #trim to just before it then lop it off + img=img[:-1].strip() + img=img[:-1] + + if debugging: + print(img) + + title=content.split('og:title" content=')[1][1:].split('>')[0] + if title[-1]=='/': + title=title[:-1].strip() + title=title[:-1] + + if debugging: + print(title) + + + author='' + if sourceName=='The Blaze': + if 'class="article-author">' in content: + author=content.split('class="article-author">')[1].split('<')[0] + elif 'class="article-author" href="' in content: + author=content.split('class="article-author" href="')[1] + author=author.split('>')[1].split('<')[0].strip() + else: + authorTags=['article:author', 'dc.creator', 'property="author'] + for tag in authorTags: + if tag in content: + author=content.split(tag+'" content=')[1][1:].split('>')[0] + author=author[:-1] + #trim an extra quotation mark for The Hill + if sourceName=='The Hill': + author=author.split('"', 1)[0] + break + + if debugging: + print(author) + + + if 'og:description' in content: + description=content.split('og:description" content=')[1][1:].split('>')[0] + if description[-1]=='/': + description=description[:-1].strip() + description=description[:-1] + else: + if sourceName=='The Hill': + description=content.split('div class="field-items"')[-1] + description=re.sub('<[^<]+?>', '', description) + description=description[1:200] + else: + print("SHOULDN'T GET HERE") + + #strip out self-references + description=description.replace(sourceName+"'s", '***') + 
description=description.replace(sourceName+"'", '***') + description=description.replace(sourceName, '***') + + if debugging: + print(description) + + + a=Article(title, url, img, description, sourceName, author) + return a + + except: + print('^^^^^^^^^^^^^^^^^^^^^^^^^') + print('\tARTICLE PARSING ERROR') + print('SOURCE: '+sourceName) + print('URL: \t'+url) + print('^^^^^^^^^^^^^^^^^^^^^^^^^ \n\n') + return None + + +def buildOutput(newsSourceArr): + #read in the template html file + f=open('html_template/template.html', 'r') + template=f.read() + f.close() + + #set the random order for sources + h1RandomSources=[] + while len(h1RandomSources)<4: + x=random.sample(range(len(newsSourceArr)), 1)[0] + if len(newsSourceArr[x].h1Arr)>0: + if x not in h1RandomSources: + h1RandomSources.append(x) + else: + print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + + #For h2s and h3s, select N random sources (can repeat), then + #a non-repetitive random article from within + h2RandomPairs=[] + while len(h2RandomPairs) < 6: + x=random.sample(range(len(newsSourceArr)), 1)[0] + if len(newsSourceArr[x].h2Arr) > 0: + y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0] + pair=[x,y] + if not pair in h2RandomPairs: + h2RandomPairs.append(pair) + else: + print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + + h3RandomPairs=[] + while len(h3RandomPairs) < 12: + x=random.sample(range(len(newsSourceArr)), 1)[0] + print(newsSourceArr[x].name) + if len(newsSourceArr[x].h3Arr) > 0: + y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0] + pair=[x,y] + if not pair in h3RandomPairs: + h3RandomPairs.append(pair) + else: + print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + + #replace html template locations with data from newsSourceArr + for i in range(len(h1RandomSources)): + source=newsSourceArr[h1RandomSources[i]] + randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] + article=source.h1Arr[randomArticle] + template=template.replace('xxURL1-'+str(i+1)+'xx', article.url) + template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title) + template=template.replace('xxImg1-'+str(i+1)+'xx', article.img) + desc=article.description + if len(desc)>144: + desc=desc[:141] + desc=desc.split()[:-1] + desc=' '.join(desc)+' ...' 
+ template=template.replace('xxDesc1-'+str(i+1)+'xx', desc) + + for i in range(len(h2RandomPairs)): + pair=h2RandomPairs[i] + article=newsSourceArr[pair[0]].h2Arr[pair[1]] + template=template.replace('xxURL2-'+str(i+1)+'xx', article.url) + template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title) + template=template.replace('xxImg2-'+str(i+1)+'xx', article.img) + + for i in range(len(h3RandomPairs)): + pair=h3RandomPairs[i] + article=newsSourceArr[pair[0]].h3Arr[pair[1]] + template=template.replace('xxURL3-'+str(i+1)+'xx', article.url) + template=template.replace('xxTitle3-'+str(i+1)+'xx', article.title) + template=template.replace('xxImg3-'+str(i+1)+'xx', article.img) + + + sourcesStr='' + for i in range(len(newsSourceArr)-1): + sourcesStr+=newsSourceArr[i].name+', ' + sourcesStr+=newsSourceArr[-1].name + print('Successfully parsed: '+sourcesStr) + template=template.replace('xxSourcesxx', sourcesStr) + + + #return updated text + return template + +def printOutputHTML(outputHTML, outFile): + timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + outputHTML=outputHTML.replace('xxTimexx', timestamp) + + f=open(outFile, 'w') + f.write(outputHTML) + f.close() + +def buildNewsSourceArr(sourceList): + + #build the data structure + i=0 + listLen=len(sourceList) + while i < listLen: + source=sourceList[i] + + if type(source) is NewsSource2: + i+=1 + continue + + url=source.url + + #download file + os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) + + #read file + f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8") + content=f.read() + f.close() + + #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS + #os.remove('scratch/temp'+str(i)+'.html') + + #add stories etc to the NewsSource object + h1s, h2s, h3s=extractURLs(content, source) + + #build the Article objects and add to newsSource's appropriate list + if h1s!=None and h2s!=None: + for url in h1s: + article=buildArticle(url, source.name) + if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article) + for url in h2s: + article=buildArticle(url, source.name) + if article!=None: sourceList[i].h2Arr.append(article) + for url in h3s: + article=buildArticle(url, source.name) + if article!=None: sourceList[i].h3Arr.append(article) + i+=1 + else: + sourceList.remove(source) + listLen-=1 + + + #return the original sourceList, + #since everything should have been modified in place + return sourceList diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py new file mode 100644 index 0000000..3affbe6 --- /dev/null +++ b/unbiased/unbiasedObjects.py @@ -0,0 +1,90 @@ +class Article(): + title='' + url='' + img='' + description='' + source='' + author='' + + def __init__(self, title, url, img, description, source, author): + self.title=title + self.url=url + self.img=img + self.description=description + self.source=source + self.author=author + + def __str__(self): + return '-----------\n'+self.title+'\n'+self.author+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------' + + +class NewsSource2(): + name='' + url='' + h1Arr=[] + h2Arr=[] + h3Arr=[] + def __init__(self, name, url, h1Arr, h2Arr, h3Arr): + self.name=name + self.url=url + self.h1Arr=h1Arr + self.h2Arr=h2Arr + self.h3Arr=h3Arr + + + +class NewsSource(): + name='' + url='' + #multiple start values to step through file. 
end value default to '"' + h1SectionDividerStart=None + h1SectionDividerEnd=None + h1DelStart=[] + h1DelEnd='"' + h2SectionDividerStart=None + h2SectionDividerEnd=None + h2DelStart=[] + h2DelEnd='"' + h3SectionDividerStart=None + h3SectionDividerEnd=None + h3DelStart=[] + h3DelEnd='"' + #arrays of Article object types + h1Arr=None + h2Arr=None + h3Arr=None + #url to attach to stub links + stubURL='' + + def __init__(self, name, url, + h1DelStart, h2DelStart, h3DelStart, + h1SectionDividerStart=None, h1SectionDividerEnd=None, + h2SectionDividerStart=None, h2SectionDividerEnd=None, + h3SectionDividerStart=None, h3SectionDividerEnd=None, + stubURL=None): + self.name=name + self.url=url + self.h1DelStart=h1DelStart + self.h2DelStart=h2DelStart + self.h3DelStart=h3DelStart + self.h1SectionDividerStart=h1SectionDividerStart + self.h2SectionDividerStart=h2SectionDividerStart + self.h3SectionDividerStart=h3SectionDividerStart + self.h1SectionDividerEnd=h1SectionDividerEnd + self.h2SectionDividerEnd=h2SectionDividerEnd + self.h3SectionDividerEnd=h3SectionDividerEnd + self.h1Arr=[] + self.h2Arr=[] + self.h3Arr=[] + self.stubURL=stubURL + + def addArticle(self, article, level): + if level==1: + self.h1Arr.append(article) + elif level==2: + self.h2Arr.append(article) + elif level==3: + self.h3Arr.append(article) + else: + print("Error: invalid level in NewsSource.addArtlce: ", level) + diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py deleted file mode 100644 index 1a80d7a..0000000 --- a/unbiasedFunctions.py +++ /dev/null @@ -1,259 +0,0 @@ -from unbiasedObjects import * -import os -import random -import time -import re - - -#take in a url and delimiters, return twitter card -def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): - - debugging=False - if debugging: - print(sourceName) - print(url) - print() - - #download url - os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) - - #read the file in - f=open('scratch/temp_article.html', 'r', encoding="utf8") - content=f.read() - f.close() - - try: - if sourceName=='The Guardian': - #The Guardian puts an identifying banner on their og:images - #grab the main image from the page instead - - #scenario 1: regular image - if '')[0] - elif sourceName=='ABC News': - img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' - if img[-1]=='/': - #because the quote separator could be ' or ", - #trim to just before it then lop it off - img=img[:-1].strip() - img=img[:-1] - - if debugging: - print(img) - - title=content.split('og:title" content=')[1][1:].split('>')[0] - if title[-1]=='/': - title=title[:-1].strip() - title=title[:-1] - - if debugging: - print(title) - - - author='' - if sourceName=='The Blaze': - if 'class="article-author">' in content: - author=content.split('class="article-author">')[1].split('<')[0] - elif 'class="article-author" href="' in content: - author=content.split('class="article-author" href="')[1] - author=author.split('>')[1].split('<')[0].strip() - else: - authorTags=['article:author', 'dc.creator', 'property="author'] - for tag in authorTags: - if tag in content: - author=content.split(tag+'" content=')[1][1:].split('>')[0] - author=author[:-1] - #trim an extra quotation mark for The Hill - if sourceName=='The Hill': - author=author.split('"', 1)[0] - break - - if debugging: - print(author) - - - if 'og:description' in content: - description=content.split('og:description" content=')[1][1:].split('>')[0] - if description[-1]=='/': - 
description=description[:-1].strip() - description=description[:-1] - else: - if sourceName=='The Hill': - description=content.split('div class="field-items"')[-1] - description=re.sub('<[^<]+?>', '', description) - description=description[1:200] - else: - print("SHOULDN'T GET HERE") - - #strip out self-references - description=description.replace(sourceName+"'s", '***') - description=description.replace(sourceName+"'", '***') - description=description.replace(sourceName, '***') - - if debugging: - print(description) - - - a=Article(title, url, img, description, sourceName, author) - return a - - except: - print('^^^^^^^^^^^^^^^^^^^^^^^^^') - print('\tARTICLE PARSING ERROR') - print('SOURCE: '+sourceName) - print('URL: \t'+url) - print('^^^^^^^^^^^^^^^^^^^^^^^^^ \n\n') - return None - - -def buildOutput(newsSourceArr): - #read in the template html file - f=open('html_template/template.html', 'r') - template=f.read() - f.close() - - #set the random order for sources - h1RandomSources=[] - while len(h1RandomSources)<4: - x=random.sample(range(len(newsSourceArr)), 1)[0] - if len(newsSourceArr[x].h1Arr)>0: - if x not in h1RandomSources: - h1RandomSources.append(x) - else: - print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - - #For h2s and h3s, select N random sources (can repeat), then - #a non-repetitive random article from within - h2RandomPairs=[] - while len(h2RandomPairs) < 6: - x=random.sample(range(len(newsSourceArr)), 1)[0] - if len(newsSourceArr[x].h2Arr) > 0: - y=random.sample(range(len(newsSourceArr[x].h2Arr)), 1)[0] - pair=[x,y] - if not pair in h2RandomPairs: - h2RandomPairs.append(pair) - else: - print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - - h3RandomPairs=[] - while len(h3RandomPairs) < 12: - x=random.sample(range(len(newsSourceArr)), 1)[0] - print(newsSourceArr[x].name) - if len(newsSourceArr[x].h3Arr) > 0: - y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0] - pair=[x,y] - if not pair in h3RandomPairs: - h3RandomPairs.append(pair) - else: - print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - - #replace html template locations with data from newsSourceArr - for i in range(len(h1RandomSources)): - source=newsSourceArr[h1RandomSources[i]] - randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] - article=source.h1Arr[randomArticle] - template=template.replace('xxURL1-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg1-'+str(i+1)+'xx', article.img) - desc=article.description - if len(desc)>144: - desc=desc[:141] - desc=desc.split()[:-1] - desc=' '.join(desc)+' ...' 
- template=template.replace('xxDesc1-'+str(i+1)+'xx', desc) - - for i in range(len(h2RandomPairs)): - pair=h2RandomPairs[i] - article=newsSourceArr[pair[0]].h2Arr[pair[1]] - template=template.replace('xxURL2-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg2-'+str(i+1)+'xx', article.img) - - for i in range(len(h3RandomPairs)): - pair=h3RandomPairs[i] - article=newsSourceArr[pair[0]].h3Arr[pair[1]] - template=template.replace('xxURL3-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle3-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg3-'+str(i+1)+'xx', article.img) - - - sourcesStr='' - for i in range(len(newsSourceArr)-1): - sourcesStr+=newsSourceArr[i].name+', ' - sourcesStr+=newsSourceArr[-1].name - print('Successfully parsed: '+sourcesStr) - template=template.replace('xxSourcesxx', sourcesStr) - - - #return updated text - return template - -def printOutputHTML(outputHTML, outFile): - timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) - outputHTML=outputHTML.replace('xxTimexx', timestamp) - - f=open(outFile, 'w') - f.write(outputHTML) - f.close() - -def buildNewsSourceArr(sourceList): - - #build the data structure - i=0 - listLen=len(sourceList) - while i < listLen: - source=sourceList[i] - - if type(source) is NewsSource2: - i+=1 - continue - - url=source.url - - #download file - os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) - - #read file - f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8") - content=f.read() - f.close() - - #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS - #os.remove('scratch/temp'+str(i)+'.html') - - #add stories etc to the NewsSource object - h1s, h2s, h3s=extractURLs(content, source) - - #build the Article objects and add to newsSource's appropriate list - if h1s!=None and h2s!=None: - for url in h1s: - article=buildArticle(url, source.name) - if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article) - for url in h2s: - article=buildArticle(url, source.name) - if article!=None: sourceList[i].h2Arr.append(article) - for url in h3s: - article=buildArticle(url, source.name) - if article!=None: sourceList[i].h3Arr.append(article) - i+=1 - else: - sourceList.remove(source) - listLen-=1 - - - #return the original sourceList, - #since everything should have been modified in place - return sourceList diff --git a/unbiasedObjects.py b/unbiasedObjects.py deleted file mode 100644 index 3affbe6..0000000 --- a/unbiasedObjects.py +++ /dev/null @@ -1,90 +0,0 @@ -class Article(): - title='' - url='' - img='' - description='' - source='' - author='' - - def __init__(self, title, url, img, description, source, author): - self.title=title - self.url=url - self.img=img - self.description=description - self.source=source - self.author=author - - def __str__(self): - return '-----------\n'+self.title+'\n'+self.author+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------' - - -class NewsSource2(): - name='' - url='' - h1Arr=[] - h2Arr=[] - h3Arr=[] - def __init__(self, name, url, h1Arr, h2Arr, h3Arr): - self.name=name - self.url=url - self.h1Arr=h1Arr - self.h2Arr=h2Arr - self.h3Arr=h3Arr - - - -class NewsSource(): - name='' - url='' - #multiple start values to step through file. 
end value default to '"' - h1SectionDividerStart=None - h1SectionDividerEnd=None - h1DelStart=[] - h1DelEnd='"' - h2SectionDividerStart=None - h2SectionDividerEnd=None - h2DelStart=[] - h2DelEnd='"' - h3SectionDividerStart=None - h3SectionDividerEnd=None - h3DelStart=[] - h3DelEnd='"' - #arrays of Article object types - h1Arr=None - h2Arr=None - h3Arr=None - #url to attach to stub links - stubURL='' - - def __init__(self, name, url, - h1DelStart, h2DelStart, h3DelStart, - h1SectionDividerStart=None, h1SectionDividerEnd=None, - h2SectionDividerStart=None, h2SectionDividerEnd=None, - h3SectionDividerStart=None, h3SectionDividerEnd=None, - stubURL=None): - self.name=name - self.url=url - self.h1DelStart=h1DelStart - self.h2DelStart=h2DelStart - self.h3DelStart=h3DelStart - self.h1SectionDividerStart=h1SectionDividerStart - self.h2SectionDividerStart=h2SectionDividerStart - self.h3SectionDividerStart=h3SectionDividerStart - self.h1SectionDividerEnd=h1SectionDividerEnd - self.h2SectionDividerEnd=h2SectionDividerEnd - self.h3SectionDividerEnd=h3SectionDividerEnd - self.h1Arr=[] - self.h2Arr=[] - self.h3Arr=[] - self.stubURL=stubURL - - def addArticle(self, article, level): - if level==1: - self.h1Arr.append(article) - elif level==2: - self.h2Arr.append(article) - elif level==3: - self.h3Arr.append(article) - else: - print("Error: invalid level in NewsSource.addArtlce: ", level) - -- cgit v1.2.3 From 5b0c9c5daa36878513bcc5edbe87a5fe52fdbb82 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 00:34:26 -0400 Subject: get it to run from the package --- .gitignore | 4 ++++ setup.py | 20 ++++++++++++++++++++ unbiased/__init__.py | 0 unbiased/main.py | 19 ++++++++++--------- unbiased/parser.py | 5 +++-- unbiased/spotCheck.py | 6 +++--- unbiased/unbiasedFunctions.py | 28 +++++++++++++++++----------- 7 files changed, 57 insertions(+), 25 deletions(-) create mode 100644 setup.py create mode 100644 unbiased/__init__.py diff --git a/.gitignore b/.gitignore index 90bf98d..238da47 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,9 @@ legacy_py/ unbiased.html html_template/Penguins.jpg html_template/BAK* +build/ +dist/ +venv/ +unbiased.egg-info/ #* .#* diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8b73e6d --- /dev/null +++ b/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup + +setup( + name="unbiased", + version="0", + packages=['unbiased'], + package_data={ + 'unbiased': [ + 'html_template/*.html', + 'html_template/*.css', + ], + }, + install_requires=[ + ], + entry_points={ + 'console_scripts': [ + 'unbiased = unbiased.main:main', + ], + }, +) diff --git a/unbiased/__init__.py b/unbiased/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/unbiased/main.py b/unbiased/main.py index f1c3317..b8bd4cb 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -1,26 +1,26 @@ #!/usr/bin/env python3 import argparse -import os - -from unbiasedObjects import * -from unbiasedFunctions import * -from parser import * import time +from unbiased.unbiasedObjects import * +from unbiased.unbiasedFunctions import * +from unbiased.parser import * + def main(): parser = argparse.ArgumentParser() parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html') + parser.add_argument('-s', '--scratch', default='/opt/unbiased/scratch', help='writable scratch workspace') args = parser.parse_args() while True: print('-----------------------') - run(args.webroot) + run(args.webroot, args.scratch) print('-----------------------') 
time.sleep(600) -def run(webroot): +def run(webroot, scratch): sourceList=[] ''' @@ -33,6 +33,7 @@ def run(webroot): ''' print('running with webroot="{}"'.format(webroot)) + print('running with scratch="{}"'.format(scratch)) ### These values have to be the second half of the function name @@ -51,7 +52,7 @@ def run(webroot): src=method() sourceList.append(src) break - except: + except Exception: print('Build error. Looping again: '+source) tries+=1 time.sleep(tries) @@ -63,7 +64,7 @@ def run(webroot): outputHTML=buildOutput(newsSourceArr) #print the output file HTML - printOutputHTML(outputHTML, os.path.join(webroot, 'index.html')) + printOutputHTML(outputHTML, webroot) if __name__=="__main__": diff --git a/unbiased/parser.py b/unbiased/parser.py index f69281b..1f9bc5c 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 -from unbiasedObjects import * -from unbiasedFunctions import buildArticle import os import re +from unbiased.unbiasedObjects import * +from unbiased.unbiasedFunctions import buildArticle + ''' Takes in a URL, downloads the file to a temp file, diff --git a/unbiased/spotCheck.py b/unbiased/spotCheck.py index d1edda4..7ce50d3 100755 --- a/unbiased/spotCheck.py +++ b/unbiased/spotCheck.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 - -from parser import * -from unbiasedObjects import * import sys +from unbiased.parser import * +from unbiased.unbiasedObjects import * + def spotCheck(src): fns = {'hil' : buildTheHill, diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 1a80d7a..3d3363b 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -1,8 +1,10 @@ -from unbiasedObjects import * import os +import pkgutil import random -import time import re +import time + +from unbiased.unbiasedObjects import * #take in a url and delimiters, return twitter card @@ -110,7 +112,7 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, a=Article(title, url, img, description, sourceName, author) return a - except: + except Exception: print('^^^^^^^^^^^^^^^^^^^^^^^^^') print('\tARTICLE PARSING ERROR') print('SOURCE: '+sourceName) @@ -121,9 +123,8 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, def buildOutput(newsSourceArr): #read in the template html file - f=open('html_template/template.html', 'r') - template=f.read() - f.close() + template=pkgutil.get_data('unbiased', 'html_template/template.html') + template = template.decode('utf8') #set the random order for sources h1RandomSources=[] @@ -201,13 +202,18 @@ def buildOutput(newsSourceArr): #return updated text return template -def printOutputHTML(outputHTML, outFile): +def printOutputHTML(outputHTML, outDir): timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) outputHTML=outputHTML.replace('xxTimexx', timestamp) - - f=open(outFile, 'w') - f.write(outputHTML) - f.close() + + with open(os.path.join(outDir, 'index.html'), 'w') as fp: + fp.write(outputHTML) + + # copy over the template css file + css = pkgutil.get_data('unbiased', 'html_template/unbiased.css') + css = css.decode('utf8') + with open(os.path.join(outDir, 'unbiased.css'), 'w') as fp: + fp.write(css) def buildNewsSourceArr(sourceList): -- cgit v1.2.3 From 6a0a5579ea9b3674f011eabd2a4c339100a66ba8 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 13:44:46 -0400 Subject: read the scratch dir path on the command line --- unbiased/main.py | 7 +-- unbiased/parser.py | 100 
++++++++++++++++++++++-------------------- unbiased/unbiasedFunctions.py | 29 +++++++----- 3 files changed, 74 insertions(+), 62 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index b8bd4cb..159a98b 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -49,16 +49,17 @@ def run(webroot, scratch): possibles = globals().copy() possibles.update(locals()) method = possibles.get(fn) - src=method() + src=method(scratch) sourceList.append(src) break - except Exception: + except Exception as ex: + print(ex) print('Build error. Looping again: '+source) tries+=1 time.sleep(tries) #scrape all urls and build data structure - newsSourceArr=buildNewsSourceArr(sourceList) + newsSourceArr=buildNewsSourceArr(sourceList, scratch) #build the output file HTML outputHTML=buildOutput(newsSourceArr) diff --git a/unbiased/parser.py b/unbiased/parser.py index 1f9bc5c..ea2a187 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -2,6 +2,7 @@ import os import re +import subprocess from unbiased.unbiasedObjects import * from unbiased.unbiasedFunctions import buildArticle @@ -11,15 +12,18 @@ from unbiased.unbiasedFunctions import buildArticle Takes in a URL, downloads the file to a temp file, reads the file into a string, and returns that string ''' -def urlToContent(url, sourceEncoding='utf8'): +def urlToContent(url, scratchDir, sourceEncoding='utf8'): + temp_file = os.path.join(scratchDir, 'temp1.html') + #download file - os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url]) #read file if sourceEncoding=='utf8': - f=open('scratch/temp1.html', 'r', encoding="utf8") + f=open(temp_file, 'r', encoding="utf8") else: - f=open('scratch/temp1.html', 'r', encoding="latin-1") + f=open(temp_file, 'r', encoding="latin-1") content=f.read() f.close() @@ -31,9 +35,9 @@ Creates a new newsSource2 object. For each URL in h1-h3URLs, calls the file scraper and appends the new Article object. 
Returns a newsSource2 object ''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): +def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): h1Arr=[] - a=buildArticle(h1URLs[0], name) + a=buildArticle(h1URLs[0], name, scratchDir) if a==None: print('................\nH1 Nonetype in '+name+'\n................') else: @@ -41,7 +45,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): h2Arr=[] for x in h2URLs: - a=buildArticle(x, name) + a=buildArticle(x, name, scratchDir) if a!=None: h2Arr.append(a) else: @@ -50,7 +54,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): h3Arr=[] for x in h3URLs: - a=buildArticle(x, name) + a=buildArticle(x, name, scratchDir) if a!=None: h3Arr.append(a) else: @@ -157,12 +161,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b -def buildTheHill(): +def buildTheHill(scratchDir): url='http://thehill.com' name='The Hill' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -194,7 +198,7 @@ def buildTheHill(): h3s.append(url+x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - hil=buildNewsSource2(name, url, h1s, h2s, h3s) + hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None) return hil @@ -203,14 +207,14 @@ def buildTheHill(): -def buildGuardian(): +def buildGuardian(scratchDir): url='http://www.theguardian.com/us' name='The Guardian US' while True: #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, 'utf8') + content=urlToContent(url, scratchDir, 'utf8') #get main headline h1=content @@ -252,20 +256,20 @@ def buildGuardian(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - gdn=buildNewsSource2(name, url, h1s, h2s, h3s) + gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None) return gdn -def buildWashTimes(): +def buildWashTimes(scratchDir): url='http://www.washingtontimes.com/' name='Washington Times' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -301,19 +305,19 @@ def buildWashTimes(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wat=buildNewsSource2(name, url, h1s, h2s, h3s) + wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) wat=removeBadStories(wat, None, None, None, None) return wat -def buildCSM(): +def buildCSM(scratchDir): url='http://www.csmonitor.com/USA' name='Christian Science Monitor' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #this makes sure we don't get '/USA' in the URL twice url=url.split('/USA')[0] @@ -364,7 +368,7 @@ def buildCSM(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - csm=buildNewsSource2(name, url, h1s, h2s, h3s) + csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) badTitleArr=['Change Agent'] badDescArr=None @@ -384,7 +388,7 @@ in The Blaze articles by grabbing the first portion of the story instead def blazeFixDesc(articleArr): TAG_RE = re.compile(r'<[^>]+>') for i in range(len(articleArr)): - desc=urlToContent(articleArr[i].url) + desc=urlToContent(articleArr[i].url, scratchDir) desc=desc.split('
    ', 1)[1] desc=desc.split('

    ', 1)[1] desc=TAG_RE.sub('', desc) @@ -396,12 +400,12 @@ def blazeFixDesc(articleArr): -def buildBlaze(): +def buildBlaze(scratchDir): url='http://theblaze.com' name='The Blaze' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -435,7 +439,7 @@ def buildBlaze(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - blz=buildNewsSource2(name, url, h1s, h2s, h3s) + blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) badTitleArr=['Tucker Carlson', 'Mark Levin'] badDescArr=['Lawrence Jones', 'Mike Slater'] @@ -455,12 +459,12 @@ def buildBlaze(): -def buildCBS(): +def buildCBS(scratchDir): url='http://cbsnews.com' name='CBS News' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -504,7 +508,7 @@ def buildCBS(): h3s.append(url+x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - cbs=buildNewsSource2(name, url, h1s, h2s, h3s) + cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart']) return cbs @@ -513,12 +517,12 @@ def buildCBS(): -def buildNBC(): +def buildNBC(scratchDir): url='http://nbcnews.com' name='NBC News' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -567,7 +571,7 @@ def buildNBC(): ''' h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - nbc=buildNewsSource2(name, url, h1s, h2s, h3s) + nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) nbc=removeBadStories(nbc, None, ['First Read'], None, None, None) @@ -576,12 +580,12 @@ def buildNBC(): -def buildBBC(): +def buildBBC(scratchDir): url='http://www.bbc.com/news/world/us_and_canada' name='BBC US & Canada' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -615,7 +619,7 @@ def buildBBC(): h3s.append('http://www.bbc.com'+x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - bbc=buildNewsSource2(name, url, h1s, h2s, h3s) + bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) badTitleArr=None badDescArr=None badAuthorArr=None @@ -638,12 +642,12 @@ def buildBBC(): -def buildWeeklyStandard(): +def buildWeeklyStandard(scratchDir): url='http://www.weeklystandard.com' name='Weekly Standard' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -688,7 +692,7 @@ def buildWeeklyStandard(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wkl=buildNewsSource2(name, url, h1s, h2s, h3s) + wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) #REMOVE BAD STORIES badTitleArr=None @@ -703,12 +707,12 @@ def buildWeeklyStandard(): -def buildNPR(): +def buildNPR(scratchDir): url='http://www.npr.org/sections/news/' name='NPR' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -742,7 +746,7 @@ def buildNPR(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - npr=buildNewsSource2(name, url, h1s, h2s, h3s) + npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) #REMOVE BAD STORIES badTitleArr=['The Two-Way'] @@ -757,12 +761,12 @@ def buildNPR(): -def buildABC(): +def buildABC(scratchDir): url='http://www.abcnews.go.com' name='ABC News' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -796,7 +800,7 
@@ def buildABC(): h3s.append(x) h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s) - abc=buildNewsSource2(name, url, h1s, h2s, h3s) + abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) #REMOVE BAD STORIES badTitleArr=None @@ -811,12 +815,12 @@ def buildABC(): -def buildFoxNews(): +def buildFoxNews(scratchDir): url='http://foxnews.com' name='Fox News' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline h1=content @@ -847,7 +851,7 @@ def buildFoxNews(): h3s.append(x) h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s) - fox=buildNewsSource2(name, url, h1s, h2s, h3s) + fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) #REMOVE BAD STORIES badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker'] @@ -861,12 +865,12 @@ def buildFoxNews(): -def buildNYT(): +def buildNYT(scratchDir): url='http://www.nytimes.com' name='New York Times' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) + content=urlToContent(url, scratchDir) #get main headline #this will likely need if/else logic @@ -944,7 +948,7 @@ def buildNYT(): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - nyt=buildNewsSource2(name, url, h1s, h2s, h3s) + nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter']) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 3d3363b..6210ba8 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -2,25 +2,29 @@ import os import pkgutil import random import re +import subprocess import time from unbiased.unbiasedObjects import * #take in a url and delimiters, return twitter card -def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): +def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): debugging=False if debugging: print(sourceName) print(url) print() - + + temp_article = os.path.join(scratchDir, 'temp_article.html') + #download url - os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) + #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) + subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url]) #read the file in - f=open('scratch/temp_article.html', 'r', encoding="utf8") + f=open(temp_article, 'r', encoding="utf8") content=f.read() f.close() @@ -215,7 +219,7 @@ def printOutputHTML(outputHTML, outDir): with open(os.path.join(outDir, 'unbiased.css'), 'w') as fp: fp.write(css) -def buildNewsSourceArr(sourceList): +def buildNewsSourceArr(sourceList, scratchDir): #build the data structure i=0 @@ -229,16 +233,19 @@ def buildNewsSourceArr(sourceList): url=source.url + temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i)) + #download file - os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) + #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) + subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url]) #read file - f=open('scratch/temp'+str(i)+'.html', 'r', encoding="utf8") + f=open(temp_file, 'r', encoding="utf8") content=f.read() f.close() #delete file MAYBE DON'T DO THIS? 
CAUSES OS ERRORS - #os.remove('scratch/temp'+str(i)+'.html') + #os.remove(temp_file) #add stories etc to the NewsSource object h1s, h2s, h3s=extractURLs(content, source) @@ -246,13 +253,13 @@ def buildNewsSourceArr(sourceList): #build the Article objects and add to newsSource's appropriate list if h1s!=None and h2s!=None: for url in h1s: - article=buildArticle(url, source.name) + article=buildArticle(url, source.name, scratchDir) if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article) for url in h2s: - article=buildArticle(url, source.name) + article=buildArticle(url, source.name, scratchDir) if article!=None: sourceList[i].h2Arr.append(article) for url in h3s: - article=buildArticle(url, source.name) + article=buildArticle(url, source.name, scratchDir) if article!=None: sourceList[i].h3Arr.append(article) i+=1 else: -- cgit v1.2.3 From e5b8cdc8a02a1d6e026e2e016508a8ecb443e181 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 14:32:09 -0400 Subject: fix fox urls --- unbiased/parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/unbiased/parser.py b/unbiased/parser.py index ea2a187..f068ae8 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -827,6 +827,7 @@ def buildFoxNews(scratchDir): h1=h1.split('

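
For context on the wget handling that the patches above thread a scratch directory through: the pattern is to shell out to wget and then read the temp file back in. A rough sketch, not the repo's exact code (fetch_page is a made-up name, and it assumes wget is on PATH and the scratch directory already exists):

    import os
    import subprocess

    def fetch_page(url, scratch_dir, name='temp1.html'):
        # Write into the caller-supplied scratch workspace instead of a
        # hard-coded scratch/ directory.
        temp_file = os.path.join(scratch_dir, name)
        # An argument list avoids the shell-quoting pitfalls of the old
        # os.system() string interpolation, and check_call raises on failure.
        subprocess.check_call(['wget', '-q', '-O', temp_file,
                               '--no-check-certificate', url])
        with open(temp_file, 'r', encoding='utf8') as f:
            return f.read()
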
    Date: Mon, 17 Apr 2017 14:32:57 -0400 Subject: don't need scratch dir anymore --- unbiased/scratch/do_not_delete | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 unbiased/scratch/do_not_delete diff --git a/unbiased/scratch/do_not_delete b/unbiased/scratch/do_not_delete deleted file mode 100644 index e69de29..0000000 -- cgit v1.2.3 From fd5227f122adf65b8f5340751e037fce67e4d2c4 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 15:52:21 -0400 Subject: use jinja templates to build the output --- setup.py | 1 + unbiased/html_template/unbiased.jinja.html | 69 ++++++++++++++++++++++++++++++ unbiased/unbiasedFunctions.py | 52 +++++++++++----------- 3 files changed, 98 insertions(+), 24 deletions(-) create mode 100644 unbiased/html_template/unbiased.jinja.html diff --git a/setup.py b/setup.py index 8b73e6d..0b43b93 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ setup( ], }, install_requires=[ + 'jinja2', ], entry_points={ 'console_scripts': [ diff --git a/unbiased/html_template/unbiased.jinja.html b/unbiased/html_template/unbiased.jinja.html new file mode 100644 index 0000000..297c4c4 --- /dev/null +++ b/unbiased/html_template/unbiased.jinja.html @@ -0,0 +1,69 @@ + + + + + + + UnBiased + + + + + +
    + + + +
    + + {% for story in middle_stories %} + + +
    +
    +
    +
    {{ story.title }}
    +
    +
    + + {% endfor %} + +
    + +
    + + {% for story in bottom_stories %} + +
    + {{ story.title }} +
    + + {% endfor %} + +
    + +
    + +
    + Sources: {{ sources }} +
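
For reference, loading and rendering this package template goes through the standard Jinja2 PackageLoader pattern; a minimal sketch assuming the unbiased package (and its html_template data) is importable, with placeholder context values:

    from jinja2 import Environment, PackageLoader, select_autoescape

    env = Environment(
        loader=PackageLoader('unbiased', 'html_template'),
        autoescape=select_autoescape(['html', 'xml']),
    )
    template = env.get_template('unbiased.jinja.html')
    # The real caller passes lists of Article objects; empty lists render an
    # empty page but exercise the same code path.
    html = template.render(
        timestamp='Mon, Apr 17, 1:00pm EDT',
        top_stories=[],
        middle_stories=[],
        bottom_stories=[],
        sources='The Guardian US, NPR',
    )
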
    + + diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 6210ba8..192de8c 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -127,9 +127,13 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t def buildOutput(newsSourceArr): #read in the template html file - template=pkgutil.get_data('unbiased', 'html_template/template.html') - template = template.decode('utf8') - + from jinja2 import Environment, PackageLoader, select_autoescape + env = Environment( + loader=PackageLoader('unbiased', 'html_template'), + autoescape=select_autoescape(['html', 'xml']) + ) + template = env.get_template('unbiased.jinja.html') + #set the random order for sources h1RandomSources=[] while len(h1RandomSources)<4: @@ -139,9 +143,9 @@ def buildOutput(newsSourceArr): h1RandomSources.append(x) else: print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - + #For h2s and h3s, select N random sources (can repeat), then - #a non-repetitive random article from within + #a non-repetitive random article from within h2RandomPairs=[] while len(h2RandomPairs) < 6: x=random.sample(range(len(newsSourceArr)), 1)[0] @@ -165,34 +169,25 @@ def buildOutput(newsSourceArr): else: print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - #replace html template locations with data from newsSourceArr + # collect articles for each section + top_stories = [] for i in range(len(h1RandomSources)): source=newsSourceArr[h1RandomSources[i]] randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] article=source.h1Arr[randomArticle] - template=template.replace('xxURL1-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg1-'+str(i+1)+'xx', article.img) - desc=article.description - if len(desc)>144: - desc=desc[:141] - desc=desc.split()[:-1] - desc=' '.join(desc)+' ...' 
- template=template.replace('xxDesc1-'+str(i+1)+'xx', desc) + top_stories.append(article) + middle_stories = [] for i in range(len(h2RandomPairs)): pair=h2RandomPairs[i] article=newsSourceArr[pair[0]].h2Arr[pair[1]] - template=template.replace('xxURL2-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg2-'+str(i+1)+'xx', article.img) + middle_stories.append(article) + bottom_stories = [] for i in range(len(h3RandomPairs)): pair=h3RandomPairs[i] article=newsSourceArr[pair[0]].h3Arr[pair[1]] - template=template.replace('xxURL3-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle3-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg3-'+str(i+1)+'xx', article.img) + bottom_stories.append(article) sourcesStr='' @@ -200,11 +195,20 @@ def buildOutput(newsSourceArr): sourcesStr+=newsSourceArr[i].name+', ' sourcesStr+=newsSourceArr[-1].name print('Successfully parsed: '+sourcesStr) - template=template.replace('xxSourcesxx', sourcesStr) - + + timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + + html = template.render( + timestamp = timestamp, + top_stories = top_stories, + middle_stories = middle_stories, + bottom_stories = bottom_stories, + sources = sourcesStr, + ) + #return updated text - return template + return html def printOutputHTML(outputHTML, outDir): timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) -- cgit v1.2.3 From 26f93f9c17dcf81c69b814d9d402cd20ef32e1ef Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 16:00:45 -0400 Subject: clean up template formatting --- unbiased/html_template/unbiased.jinja.html | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unbiased/html_template/unbiased.jinja.html b/unbiased/html_template/unbiased.jinja.html index 297c4c4..778bebc 100644 --- a/unbiased/html_template/unbiased.jinja.html +++ b/unbiased/html_template/unbiased.jinja.html @@ -23,9 +23,9 @@
-    {{ story.title }}
+    {{ story.title|safe }}
-    {{ story.desc }}
+    {{ story.description|safe|truncate(140) }}
     {% endfor %}
@@ -40,7 +40,7 @@
-    {{ story.title }}
+    {{ story.title|safe }}
@@ -53,7 +53,7 @@
     {% for story in bottom_stories %}
-    {{ story.title }}
+    {{ story.title|safe }}
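
The |safe filter keeps Jinja2's autoescaping from double-escaping markup that is already escaped upstream, and |truncate(140) replaces the manual 144-character cut that an earlier hunk removed from buildOutput(). A standalone illustration of the truncate behaviour (not part of the site template):

    from jinja2 import Environment

    env = Environment()
    snippet = env.from_string('{{ description|truncate(140) }}')
    # Cuts at a word boundary near 140 characters and appends '...'.
    print(snippet.render(description='word ' * 100))
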
    {% endfor %} -- cgit v1.2.3 From 8bce5c2280441760db850d92d651d2fb0f181c50 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 21:53:42 -0400 Subject: pull the images locally and resize --- setup.py | 1 + unbiased/main.py | 2 +- unbiased/unbiasedFunctions.py | 27 +++++++++++++++++++++++++-- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 0b43b93..2755304 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ setup( }, install_requires=[ 'jinja2', + 'Pillow', ], entry_points={ 'console_scripts': [ diff --git a/unbiased/main.py b/unbiased/main.py index 159a98b..88ceb7e 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -62,7 +62,7 @@ def run(webroot, scratch): newsSourceArr=buildNewsSourceArr(sourceList, scratch) #build the output file HTML - outputHTML=buildOutput(newsSourceArr) + outputHTML=buildOutput(newsSourceArr, webroot) #print the output file HTML printOutputHTML(outputHTML, webroot) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 192de8c..16ea07d 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -7,6 +7,8 @@ import time from unbiased.unbiasedObjects import * +from PIL import Image + #take in a url and delimiters, return twitter card def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): @@ -125,7 +127,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t return None -def buildOutput(newsSourceArr): +def buildOutput(newsSourceArr, webroot): #read in the template html file from jinja2 import Environment, PackageLoader, select_autoescape env = Environment( @@ -170,17 +172,25 @@ def buildOutput(newsSourceArr): print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') # collect articles for each section + image_index = 0 + top_stories = [] for i in range(len(h1RandomSources)): source=newsSourceArr[h1RandomSources[i]] randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] article=source.h1Arr[randomArticle] + img_name = pullImage(article.img, image_index, webroot, 350, 200) + image_index += 1 + article.img = img_name top_stories.append(article) middle_stories = [] for i in range(len(h2RandomPairs)): pair=h2RandomPairs[i] article=newsSourceArr[pair[0]].h2Arr[pair[1]] + img_name = pullImage(article.img, image_index, webroot, 150, 100) + image_index += 1 + article.img = img_name middle_stories.append(article) bottom_stories = [] @@ -189,7 +199,6 @@ def buildOutput(newsSourceArr): article=newsSourceArr[pair[0]].h3Arr[pair[1]] bottom_stories.append(article) - sourcesStr='' for i in range(len(newsSourceArr)-1): sourcesStr+=newsSourceArr[i].name+', ' @@ -274,3 +283,17 @@ def buildNewsSourceArr(sourceList, scratchDir): #return the original sourceList, #since everything should have been modified in place return sourceList + +def pullImage(url, index, webroot, width=350, height=200): + extension = url.split('.')[-1].split('?')[0] + img_name = 'img{}.{}'.format(index, extension) + out_file = os.path.join(webroot, img_name) + try: + subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url]) + except Exception: + return '' + img = Image.open(out_file) + img.resize((width, height)) + jpg_name = 'img{}.jpg'.format(index) + img.save(os.path.join(webroot, jpg_name), 'JPEG') + return jpg_name -- cgit v1.2.3 From c3d54bbe304708693891fe68cf3760c5fb2545b3 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 22:59:02 -0400 
Subject: replace print statements with the logging module --- unbiased/main.py | 19 +++++++++++++------ unbiased/parser.py | 25 +++++++++++++------------ unbiased/unbiasedFunctions.py | 34 ++++++++++++++++------------------ unbiased/unbiasedObjects.py | 6 +++++- 4 files changed, 47 insertions(+), 37 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index 88ceb7e..ea5412d 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -1,12 +1,20 @@ #!/usr/bin/env python3 import argparse +import logging import time from unbiased.unbiasedObjects import * from unbiased.unbiasedFunctions import * from unbiased.parser import * +logger = logging.getLogger('unbiased') +logger.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +ch.setLevel(logging.DEBUG) +ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) +logger.addHandler(ch) + def main(): parser = argparse.ArgumentParser() @@ -15,9 +23,9 @@ def main(): args = parser.parse_args() while True: - print('-----------------------') + logger.info('Starting crawl') run(args.webroot, args.scratch) - print('-----------------------') + logger.info('Crawl complete. Sleeping for 600s') time.sleep(600) def run(webroot, scratch): @@ -32,8 +40,8 @@ def run(webroot, scratch): ''' - print('running with webroot="{}"'.format(webroot)) - print('running with scratch="{}"'.format(scratch)) + logger.debug('Running with webroot="{}"'.format(webroot)) + logger.debug('Running with scratch="{}"'.format(scratch)) ### These values have to be the second half of the function name @@ -53,8 +61,7 @@ def run(webroot, scratch): sourceList.append(src) break except Exception as ex: - print(ex) - print('Build error. Looping again: '+source) + logger.error('Build error. Looping again. source={} ex={}'.format(source, ex)) tries+=1 time.sleep(tries) diff --git a/unbiased/parser.py b/unbiased/parser.py index f068ae8..2bba27d 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import os import re import subprocess @@ -7,6 +8,8 @@ import subprocess from unbiased.unbiasedObjects import * from unbiased.unbiasedFunctions import buildArticle +logger = logging.getLogger('unbiased') + ''' Takes in a URL, downloads the file to a temp file, @@ -39,7 +42,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): h1Arr=[] a=buildArticle(h1URLs[0], name, scratchDir) if a==None: - print('................\nH1 Nonetype in '+name+'\n................') + logger.debug('H1 Nonetype in '+name) else: h1Arr.append(a) @@ -49,7 +52,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): if a!=None: h2Arr.append(a) else: - print('................\nH2 Nonetype in '+name+'\n................') + logger.debug('H2 Nonetype in '+name) h3Arr=[] @@ -58,7 +61,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): if a!=None: h3Arr.append(a) else: - print('................\nH3 Nonetype in '+name+'\n................') + logger.debug('H3 Nonetype in '+name) #BUILD THE NEWS SOURCE newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) @@ -119,13 +122,11 @@ def removeDuplicates(h1s, h2s, h3s): def removalNotification(source, title, reason, value): - print('*************************') - print('\t\tSTORY REMOVED') - print('SOURCE: '+source) - print('TITLE: \t'+title) - print('REASON: '+reason) - print('VALUE: \t'+value) - print('*************************\n\n') + logger.debug("""Story removed + SOURCE:\t{} + TITLE:\t{}) + REASON:\t{} + VALUE:\t{}""".format(source, title, reason, value)) 
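
The handler wiring this patch adds to main.py is the stock logging-module setup: one named logger, one console handler, and a shared format string. The other modules then pick up the same logger by name, so a sketch of the whole pattern looks like this:

    import logging

    # main.py: configure the 'unbiased' logger once at startup.
    logger = logging.getLogger('unbiased')
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(ch)

    # parser.py / unbiasedFunctions.py: grab the same logger by name.
    log = logging.getLogger('unbiased')
    log.debug('H1 Nonetype in %s', 'Example Source')
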
def removeBadStoriesHelper(source, element, badStringList, arr): @@ -133,7 +134,7 @@ def removeBadStoriesHelper(source, element, badStringList, arr): for i in range(len(arr)): for hed in arr[i]: if hed==None: - print("////////\nNone type found in removeBadStoriesHelper for "+source.name+"\n/////////") + logger.debug("None type found in removeBadStoriesHelper for "+source.name) break for item in badStringList: if item in getattr(hed, element): @@ -225,7 +226,7 @@ def buildGuardian(scratchDir): if h1!='https://www.theguardian.com/us': break else: - print('Guardian loop') + logger.debug('Guardian loop') h1s=[h1] diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 16ea07d..775346f 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -1,3 +1,4 @@ +import logging import os import pkgutil import random @@ -9,15 +10,15 @@ from unbiased.unbiasedObjects import * from PIL import Image +logger = logging.getLogger('unbiased') #take in a url and delimiters, return twitter card def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): debugging=False if debugging: - print(sourceName) - print(url) - print() + logger.debug(sourceName) + logger.debug(url) temp_article = os.path.join(scratchDir, 'temp_article.html') @@ -60,7 +61,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t img=img[:-1] if debugging: - print(img) + logger.debug(img) title=content.split('og:title" content=')[1][1:].split('>')[0] if title[-1]=='/': @@ -68,7 +69,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t title=title[:-1] if debugging: - print(title) + logger.debug(title) author='' @@ -90,7 +91,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t break if debugging: - print(author) + logger.debug(author) if 'og:description' in content: @@ -104,7 +105,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t description=re.sub('<[^<]+?>', '', description) description=description[1:200] else: - print("SHOULDN'T GET HERE") + logger.debug("SHOULDN'T GET HERE") #strip out self-references description=description.replace(sourceName+"'s", '***') @@ -112,18 +113,16 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t description=description.replace(sourceName, '***') if debugging: - print(description) + logger.debug(description) a=Article(title, url, img, description, sourceName, author) return a except Exception: - print('^^^^^^^^^^^^^^^^^^^^^^^^^') - print('\tARTICLE PARSING ERROR') - print('SOURCE: '+sourceName) - print('URL: \t'+url) - print('^^^^^^^^^^^^^^^^^^^^^^^^^ \n\n') + logger.error("""ARTICLE PARSING ERROR + SOURCE:\t{} + URL:\t{}""".format(sourceName, url)) return None @@ -144,7 +143,7 @@ def buildOutput(newsSourceArr, webroot): if x not in h1RandomSources: h1RandomSources.append(x) else: - print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + logger.debug('No H1 stories in '+newsSourceArr[x].name) #For h2s and h3s, select N random sources (can repeat), then #a non-repetitive random article from within @@ -157,19 +156,18 @@ def buildOutput(newsSourceArr, webroot): if not pair in h2RandomPairs: h2RandomPairs.append(pair) else: - print('\n\n@@@@\nNo H2 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + logger.debug('No H2 stories in '+newsSourceArr[x].name) h3RandomPairs=[] while len(h3RandomPairs) < 12: 
x=random.sample(range(len(newsSourceArr)), 1)[0] - print(newsSourceArr[x].name) if len(newsSourceArr[x].h3Arr) > 0: y=random.sample(range(len(newsSourceArr[x].h3Arr)), 1)[0] pair=[x,y] if not pair in h3RandomPairs: h3RandomPairs.append(pair) else: - print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') + logger.debug('No H3 stories in '+newsSourceArr[x].name) # collect articles for each section image_index = 0 @@ -203,7 +201,7 @@ def buildOutput(newsSourceArr, webroot): for i in range(len(newsSourceArr)-1): sourcesStr+=newsSourceArr[i].name+', ' sourcesStr+=newsSourceArr[-1].name - print('Successfully parsed: '+sourcesStr) + logger.info('Successfully parsed: '+sourcesStr) timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py index 3affbe6..9372d3a 100644 --- a/unbiased/unbiasedObjects.py +++ b/unbiased/unbiasedObjects.py @@ -1,3 +1,7 @@ +import logging + +logger = logging.getLogger('unbiased') + class Article(): title='' url='' @@ -86,5 +90,5 @@ class NewsSource(): elif level==3: self.h3Arr.append(article) else: - print("Error: invalid level in NewsSource.addArtlce: ", level) + logger.error("Invalid level in NewsSource.addArtlce: " + level) -- cgit v1.2.3 From 48471019c86d9a78a742b282b1b25df6d69c5752 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 11:02:24 -0400 Subject: fix guardian images and image scaling --- unbiased/html_template/unbiased.css | 4 ++-- unbiased/main.py | 12 ++++++++---- unbiased/unbiasedFunctions.py | 32 ++++++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 244f100..6817cc9 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -84,7 +84,7 @@ a:hover{ vertical-align:top; text-align:left; width:360px; - height:auto; + height:350px; overflow:hidden; background:#fff; margin:10px; @@ -217,4 +217,4 @@ a:hover{ clear:both; padding-top:4em; font-size:.8em; -} \ No newline at end of file +} diff --git a/unbiased/main.py b/unbiased/main.py index ea5412d..87b1e8c 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -48,10 +48,12 @@ def run(webroot, scratch): ### E.g. Guardian calls buildGuardian(), etc. sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' - + for source in sourceFnArr: + logger.info('Crawling {}'.format(source)) tries=0 while tries<3: + time.sleep(tries) try: fn='build'+source possibles = globals().copy() @@ -61,10 +63,12 @@ def run(webroot, scratch): sourceList.append(src) break except Exception as ex: - logger.error('Build error. Looping again. source={} ex={}'.format(source, ex)) tries+=1 - time.sleep(tries) - + if tries == 3: + logger.error('Build failed. source={} ex={}'.format(source, ex)) + else: + logger.debug('Build failed, retrying. 
source={} ex={}'.format(source, ex)) + #scrape all urls and build data structure newsSourceArr=buildNewsSourceArr(sourceList, scratch) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 775346f..fdf9d8f 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -1,3 +1,4 @@ +import html import logging import os import pkgutil @@ -32,7 +33,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t f.close() try: - if sourceName=='The Guardian': + if sourceName=='The Guardian US': #The Guardian puts an identifying banner on their og:images #grab the main image from the page instead @@ -48,14 +49,15 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t elif '')[0] elif sourceName=='ABC News': img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' if img[-1]=='/': - #because the quote separator could be ' or ", + #because the quote separator could be ' or ", #trim to just before it then lop it off img=img[:-1].strip() img=img[:-1] @@ -282,16 +284,34 @@ def buildNewsSourceArr(sourceList, scratchDir): #since everything should have been modified in place return sourceList -def pullImage(url, index, webroot, width=350, height=200): +def pullImage(url, index, webroot, target_width=350, target_height=200): extension = url.split('.')[-1].split('?')[0] img_name = 'img{}.{}'.format(index, extension) out_file = os.path.join(webroot, img_name) try: subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url]) - except Exception: + except Exception as ex: + logger.error('Failed to pull image: url={} ex={}'.format(url, ex)) return '' img = Image.open(out_file) - img.resize((width, height)) + # crop to aspect ratio + target_ar = target_width / target_height + left, top, right, bottom = img.getbbox() + height = bottom - top + width = right - left + ar = width / height + if target_ar > ar: + new_height = (target_height / target_width) * width + bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2)) + img = img.crop(bbox) + elif target_ar < ar: + new_width = (target_width / target_height) * height + bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom) + img = img.crop(bbox) + # resize if larger + if target_width * 2 < width or target_height * 2 < height: + img = img.resize((target_width*2, target_height*2), Image.LANCZOS) + # TODO: create retina images jpg_name = 'img{}.jpg'.format(index) img.save(os.path.join(webroot, jpg_name), 'JPEG') return jpg_name -- cgit v1.2.3 From 761f5d564bf3d60acdeb5581d687c0c8c4b22a69 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 13:22:38 -0400 Subject: add favicons and write intermediate images to scratch --- unbiased/html_template/apple-touch-icon.png | Bin 0 -> 7036 bytes unbiased/html_template/favicon.ico | Bin 0 -> 4414 bytes unbiased/html_template/favicon.png | Bin 0 -> 1093 bytes unbiased/html_template/unbiased.jinja.html | 3 +++ unbiased/main.py | 2 +- unbiased/unbiasedFunctions.py | 37 +++++++++++++++------------- 6 files changed, 24 insertions(+), 18 deletions(-) create mode 100644 unbiased/html_template/apple-touch-icon.png create mode 100644 unbiased/html_template/favicon.ico create mode 100644 unbiased/html_template/favicon.png diff --git a/unbiased/html_template/apple-touch-icon.png b/unbiased/html_template/apple-touch-icon.png new file mode 100644 index 0000000..93c33aa Binary files /dev/null and 
b/unbiased/html_template/apple-touch-icon.png differ diff --git a/unbiased/html_template/favicon.ico b/unbiased/html_template/favicon.ico new file mode 100644 index 0000000..b2b29c6 Binary files /dev/null and b/unbiased/html_template/favicon.ico differ diff --git a/unbiased/html_template/favicon.png b/unbiased/html_template/favicon.png new file mode 100644 index 0000000..0b94313 Binary files /dev/null and b/unbiased/html_template/favicon.png differ diff --git a/unbiased/html_template/unbiased.jinja.html b/unbiased/html_template/unbiased.jinja.html index 778bebc..40c9582 100644 --- a/unbiased/html_template/unbiased.jinja.html +++ b/unbiased/html_template/unbiased.jinja.html @@ -4,6 +4,9 @@ + + + UnBiased diff --git a/unbiased/main.py b/unbiased/main.py index 87b1e8c..f784bce 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -73,7 +73,7 @@ def run(webroot, scratch): newsSourceArr=buildNewsSourceArr(sourceList, scratch) #build the output file HTML - outputHTML=buildOutput(newsSourceArr, webroot) + outputHTML=buildOutput(newsSourceArr, webroot, scratch) #print the output file HTML printOutputHTML(outputHTML, webroot) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index fdf9d8f..415a3cc 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -128,7 +128,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t return None -def buildOutput(newsSourceArr, webroot): +def buildOutput(newsSourceArr, webroot, scratch): #read in the template html file from jinja2 import Environment, PackageLoader, select_autoescape env = Environment( @@ -179,7 +179,7 @@ def buildOutput(newsSourceArr, webroot): source=newsSourceArr[h1RandomSources[i]] randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] article=source.h1Arr[randomArticle] - img_name = pullImage(article.img, image_index, webroot, 350, 200) + img_name = pullImage(article.img, image_index, webroot, scratch, 350, 200) image_index += 1 article.img = img_name top_stories.append(article) @@ -188,7 +188,7 @@ def buildOutput(newsSourceArr, webroot): for i in range(len(h2RandomPairs)): pair=h2RandomPairs[i] article=newsSourceArr[pair[0]].h2Arr[pair[1]] - img_name = pullImage(article.img, image_index, webroot, 150, 100) + img_name = pullImage(article.img, image_index, webroot, scratch, 150, 100) image_index += 1 article.img = img_name middle_stories.append(article) @@ -226,11 +226,11 @@ def printOutputHTML(outputHTML, outDir): with open(os.path.join(outDir, 'index.html'), 'w') as fp: fp.write(outputHTML) - # copy over the template css file - css = pkgutil.get_data('unbiased', 'html_template/unbiased.css') - css = css.decode('utf8') - with open(os.path.join(outDir, 'unbiased.css'), 'w') as fp: - fp.write(css) + # copy over static package files + for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']: + data = pkgutil.get_data('unbiased', os.path.join('html_template', filename)) + with open(os.path.join(outDir, filename), 'wb') as fp: + fp.write(data) def buildNewsSourceArr(sourceList, scratchDir): @@ -256,13 +256,13 @@ def buildNewsSourceArr(sourceList, scratchDir): f=open(temp_file, 'r', encoding="utf8") content=f.read() f.close() - + #delete file MAYBE DON'T DO THIS? 
CAUSES OS ERRORS #os.remove(temp_file) #add stories etc to the NewsSource object h1s, h2s, h3s=extractURLs(content, source) - + #build the Article objects and add to newsSource's appropriate list if h1s!=None and h2s!=None: for url in h1s: @@ -279,21 +279,21 @@ def buildNewsSourceArr(sourceList, scratchDir): sourceList.remove(source) listLen-=1 - + #return the original sourceList, #since everything should have been modified in place - return sourceList + return sourceList -def pullImage(url, index, webroot, target_width=350, target_height=200): +def pullImage(url, index, webroot, scratch, target_width=350, target_height=200): extension = url.split('.')[-1].split('?')[0] img_name = 'img{}.{}'.format(index, extension) - out_file = os.path.join(webroot, img_name) + tmp_file = os.path.join(scratch, img_name) try: - subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url]) + subprocess.check_call(['wget', '-q', '-O', tmp_file, '--no-check-certificate', url]) except Exception as ex: logger.error('Failed to pull image: url={} ex={}'.format(url, ex)) return '' - img = Image.open(out_file) + img = Image.open(tmp_file) # crop to aspect ratio target_ar = target_width / target_height left, top, right, bottom = img.getbbox() @@ -313,5 +313,8 @@ def pullImage(url, index, webroot, target_width=350, target_height=200): img = img.resize((target_width*2, target_height*2), Image.LANCZOS) # TODO: create retina images jpg_name = 'img{}.jpg'.format(index) - img.save(os.path.join(webroot, jpg_name), 'JPEG') + out_file = os.path.join(webroot, jpg_name) + img.save(out_file, 'JPEG') + if tmp_file != out_file: + os.remove(tmp_file) return jpg_name -- cgit v1.2.3 From 4a8cb231c3974d3f2000e170ca0c56850bc66c7f Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 13:53:33 -0400 Subject: more consistent crawl frequency --- unbiased/main.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index f784bce..c8a113e 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -22,11 +22,17 @@ def main(): parser.add_argument('-s', '--scratch', default='/opt/unbiased/scratch', help='writable scratch workspace') args = parser.parse_args() + crawl_frequency = 600 while True: logger.info('Starting crawl') + start = time.time() run(args.webroot, args.scratch) - logger.info('Crawl complete. Sleeping for 600s') - time.sleep(600) + finish = time.time() + runtime = finish - start + sleeptime = crawl_frequency - runtime + logger.info('Crawl complete in {}s. 
Sleeping for {}s'.format(int(runtime), int(sleeptime))) + if sleeptime > 0: + time.sleep(sleeptime) def run(webroot, scratch): sourceList=[] -- cgit v1.2.3 From 7a8efb94dc2463a6d30afc77f10df78ebfa4c353 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 16:39:03 -0400 Subject: replace wget with requests library --- setup.py | 1 + unbiased/html_template/unbiased.css | 8 +-- unbiased/main.py | 2 +- unbiased/parser.py | 29 ++++++----- unbiased/unbiasedFunctions.py | 98 +++++++++++-------------------------- 5 files changed, 49 insertions(+), 89 deletions(-) diff --git a/setup.py b/setup.py index 2755304..57c27c0 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ setup( install_requires=[ 'jinja2', 'Pillow', + 'requests', ], entry_points={ 'console_scripts': [ diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 6817cc9..1424ee9 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -108,8 +108,8 @@ a:hover{ width:350px; height:200px; overflow:hidden; - background-size: auto 234px;/*cover;*/ - background-position: top center;/*center center;*/ + background-size: 100%; + background-position: center center; margin:0 auto; } @@ -169,8 +169,8 @@ a:hover{ width:150px; height:100px; overflow:hidden; - background-size: auto 117px;/*cover;*/ - background-position: top center;/*center center;*/ + background-size: 100%; + background-position: center center; float:left; max-width:35%; } diff --git a/unbiased/main.py b/unbiased/main.py index c8a113e..c760788 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -76,7 +76,7 @@ def run(webroot, scratch): logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex)) #scrape all urls and build data structure - newsSourceArr=buildNewsSourceArr(sourceList, scratch) + newsSourceArr = sourceList #build the output file HTML outputHTML=buildOutput(newsSourceArr, webroot, scratch) diff --git a/unbiased/parser.py b/unbiased/parser.py index 2bba27d..0a8398c 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -4,6 +4,9 @@ import logging import os import re import subprocess +import urllib.parse + +import requests from unbiased.unbiasedObjects import * from unbiased.unbiasedFunctions import buildArticle @@ -16,21 +19,11 @@ Takes in a URL, downloads the file to a temp file, reads the file into a string, and returns that string ''' def urlToContent(url, scratchDir, sourceEncoding='utf8'): - temp_file = os.path.join(scratchDir, 'temp1.html') - - #download file - #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) - subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url]) - - #read file - if sourceEncoding=='utf8': - f=open(temp_file, 'r', encoding="utf8") + res = requests.get(url) + if res.status_code == 200: + return res.text else: - f=open(temp_file, 'r', encoding="latin-1") - content=f.read() - f.close() - - return content + raise Exception("Failed to download {}".format(url)) ''' @@ -39,6 +32,13 @@ calls the file scraper and appends the new Article object. 
Returns a newsSource2 object ''' def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): + + url_parts = urllib.parse.urlparse(url) + scheme = url_parts.scheme + h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs] + h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs] + h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs] + h1Arr=[] a=buildArticle(h1URLs[0], name, scratchDir) if a==None: @@ -54,7 +54,6 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): else: logger.debug('H2 Nonetype in '+name) - h3Arr=[] for x in h3URLs: a=buildArticle(x, name, scratchDir) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 415a3cc..0181beb 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -1,4 +1,5 @@ import html +import io import logging import os import pkgutil @@ -6,10 +7,12 @@ import random import re import subprocess import time - -from unbiased.unbiasedObjects import * +import urllib.parse from PIL import Image +import requests + +from unbiased.unbiasedObjects import * logger = logging.getLogger('unbiased') @@ -21,16 +24,25 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t logger.debug(sourceName) logger.debug(url) - temp_article = os.path.join(scratchDir, 'temp_article.html') + url_parts = urllib.parse.urlparse(url) + scheme = url_parts.scheme #download url - #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) - subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url]) + try: + res = requests.get(url) + except Exception as ex: + logger.error("""ARTICLE DOWNLOADING ERROR + SOURCE:\t{} + URL:\t{}""".format(sourceName, url)) + return None - #read the file in - f=open(temp_article, 'r', encoding="utf8") - content=f.read() - f.close() + if res.status_code == 200: + content = res.text + else: + logger.error("""ARTICLE DOWNLOADING ERROR + SOURCE:\t{} + URL:\t{}""".format(sourceName, url)) + return None try: if sourceName=='The Guardian US': @@ -61,6 +73,8 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t #trim to just before it then lop it off img=img[:-1].strip() img=img[:-1] + # fix the scheme if it's missing + img = urllib.parse.urlparse(img, scheme=scheme).geturl() if debugging: logger.debug(img) @@ -232,68 +246,16 @@ def printOutputHTML(outputHTML, outDir): with open(os.path.join(outDir, filename), 'wb') as fp: fp.write(data) -def buildNewsSourceArr(sourceList, scratchDir): - - #build the data structure - i=0 - listLen=len(sourceList) - while i < listLen: - source=sourceList[i] - - if type(source) is NewsSource2: - i+=1 - continue - - url=source.url - - temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i)) - - #download file - #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) - subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url]) - - #read file - f=open(temp_file, 'r', encoding="utf8") - content=f.read() - f.close() - - #delete file MAYBE DON'T DO THIS? 
CAUSES OS ERRORS - #os.remove(temp_file) - - #add stories etc to the NewsSource object - h1s, h2s, h3s=extractURLs(content, source) - - #build the Article objects and add to newsSource's appropriate list - if h1s!=None and h2s!=None: - for url in h1s: - article=buildArticle(url, source.name, scratchDir) - if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article) - for url in h2s: - article=buildArticle(url, source.name, scratchDir) - if article!=None: sourceList[i].h2Arr.append(article) - for url in h3s: - article=buildArticle(url, source.name, scratchDir) - if article!=None: sourceList[i].h3Arr.append(article) - i+=1 - else: - sourceList.remove(source) - listLen-=1 - - - #return the original sourceList, - #since everything should have been modified in place - return sourceList - def pullImage(url, index, webroot, scratch, target_width=350, target_height=200): extension = url.split('.')[-1].split('?')[0] img_name = 'img{}.{}'.format(index, extension) - tmp_file = os.path.join(scratch, img_name) - try: - subprocess.check_call(['wget', '-q', '-O', tmp_file, '--no-check-certificate', url]) - except Exception as ex: - logger.error('Failed to pull image: url={} ex={}'.format(url, ex)) + res = requests.get(url) + if res.status_code == 200: + content = res.content + else: + logger.error('Image not found: url={}'.format(url)) return '' - img = Image.open(tmp_file) + img = Image.open(io.BytesIO(content)) # crop to aspect ratio target_ar = target_width / target_height left, top, right, bottom = img.getbbox() @@ -315,6 +277,4 @@ def pullImage(url, index, webroot, scratch, target_width=350, target_height=200) jpg_name = 'img{}.jpg'.format(index) out_file = os.path.join(webroot, jpg_name) img.save(out_file, 'JPEG') - if tmp_file != out_file: - os.remove(tmp_file) return jpg_name -- cgit v1.2.3 From 8dffc67fae2c5a6cc1fe125809e0b74d8b4b28f3 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 16:47:30 -0400 Subject: don't need a scratch directory any more --- unbiased/main.py | 10 ++--- unbiased/parser.py | 91 +++++++++++++++++++++---------------------- unbiased/unbiasedFunctions.py | 11 +++--- 3 files changed, 54 insertions(+), 58 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index c760788..60211ea 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -19,14 +19,13 @@ logger.addHandler(ch) def main(): parser = argparse.ArgumentParser() parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html') - parser.add_argument('-s', '--scratch', default='/opt/unbiased/scratch', help='writable scratch workspace') args = parser.parse_args() crawl_frequency = 600 while True: logger.info('Starting crawl') start = time.time() - run(args.webroot, args.scratch) + run(args.webroot) finish = time.time() runtime = finish - start sleeptime = crawl_frequency - runtime @@ -34,7 +33,7 @@ def main(): if sleeptime > 0: time.sleep(sleeptime) -def run(webroot, scratch): +def run(webroot): sourceList=[] ''' @@ -47,7 +46,6 @@ def run(webroot, scratch): ''' logger.debug('Running with webroot="{}"'.format(webroot)) - logger.debug('Running with scratch="{}"'.format(scratch)) ### These values have to be the second half of the function name @@ -65,7 +63,7 @@ def run(webroot, scratch): possibles = globals().copy() possibles.update(locals()) method = possibles.get(fn) - src=method(scratch) + src=method() sourceList.append(src) break except Exception as ex: @@ -79,7 +77,7 @@ def run(webroot, scratch): newsSourceArr = sourceList 
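
A condensed sketch of the image path the requests/Pillow patches above converge on: download the bytes, center-crop to the target aspect ratio, and save a JPEG. The function name is made up, and the convert('RGB') call is an extra safety step (Pillow refuses to write RGBA data as JPEG), not something the patch itself does:

    import io

    import requests
    from PIL import Image

    def fetch_and_crop(url, out_path, target_width=350, target_height=200):
        res = requests.get(url)
        if res.status_code != 200:
            return None
        img = Image.open(io.BytesIO(res.content))
        width, height = img.size
        target_ar = target_width / target_height
        ar = width / height
        if ar > target_ar:
            # Too wide: trim equal amounts from the left and right.
            new_width = int(target_ar * height)
            left = (width - new_width) // 2
            img = img.crop((left, 0, left + new_width, height))
        elif ar < target_ar:
            # Too tall: trim equal amounts from the top and bottom.
            new_height = int(width / target_ar)
            top = (height - new_height) // 2
            img = img.crop((0, top, width, top + new_height))
        img = img.convert('RGB')
        img.save(out_path, 'JPEG')
        return out_path
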
#build the output file HTML - outputHTML=buildOutput(newsSourceArr, webroot, scratch) + outputHTML=buildOutput(newsSourceArr, webroot) #print the output file HTML printOutputHTML(outputHTML, webroot) diff --git a/unbiased/parser.py b/unbiased/parser.py index 0a8398c..41727f5 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -3,7 +3,6 @@ import logging import os import re -import subprocess import urllib.parse import requests @@ -18,7 +17,7 @@ logger = logging.getLogger('unbiased') Takes in a URL, downloads the file to a temp file, reads the file into a string, and returns that string ''' -def urlToContent(url, scratchDir, sourceEncoding='utf8'): +def urlToContent(url, sourceEncoding='utf8'): res = requests.get(url) if res.status_code == 200: return res.text @@ -31,7 +30,7 @@ Creates a new newsSource2 object. For each URL in h1-h3URLs, calls the file scraper and appends the new Article object. Returns a newsSource2 object ''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): +def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): url_parts = urllib.parse.urlparse(url) scheme = url_parts.scheme @@ -40,7 +39,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs] h1Arr=[] - a=buildArticle(h1URLs[0], name, scratchDir) + a=buildArticle(h1URLs[0], name) if a==None: logger.debug('H1 Nonetype in '+name) else: @@ -48,7 +47,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): h2Arr=[] for x in h2URLs: - a=buildArticle(x, name, scratchDir) + a=buildArticle(x, name) if a!=None: h2Arr.append(a) else: @@ -56,7 +55,7 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): h3Arr=[] for x in h3URLs: - a=buildArticle(x, name, scratchDir) + a=buildArticle(x, name) if a!=None: h3Arr.append(a) else: @@ -161,12 +160,12 @@ def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, b -def buildTheHill(scratchDir): +def buildTheHill(): url='http://thehill.com' name='The Hill' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -198,7 +197,7 @@ def buildTheHill(scratchDir): h3s.append(url+x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - hil=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + hil=buildNewsSource2(name, url, h1s, h2s, h3s) hil=removeBadStories(hil, ['THE MEMO'], None, ['Matt Schlapp', 'Juan Williams', 'Judd Gregg'], None, None) return hil @@ -207,14 +206,14 @@ def buildTheHill(scratchDir): -def buildGuardian(scratchDir): +def buildGuardian(): url='http://www.theguardian.com/us' name='The Guardian US' while True: #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir, 'utf8') + content=urlToContent(url, 'utf8') #get main headline h1=content @@ -256,20 +255,20 @@ def buildGuardian(scratchDir): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - gdn=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + gdn=buildNewsSource2(name, url, h1s, h2s, h3s) gdn=removeBadStories(gdn, None, ['Tom McCarthy', 'Andy Hunter'], ['https://www.theguardian.com/profile/ben-jacobs'], None) return gdn -def buildWashTimes(scratchDir): +def buildWashTimes(): url='http://www.washingtontimes.com/' name='Washington Times' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -305,19 +304,19 @@ def buildWashTimes(scratchDir): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - 
wat=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + wat=buildNewsSource2(name, url, h1s, h2s, h3s) wat=removeBadStories(wat, None, None, None, None) return wat -def buildCSM(scratchDir): +def buildCSM(): url='http://www.csmonitor.com/USA' name='Christian Science Monitor' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #this makes sure we don't get '/USA' in the URL twice url=url.split('/USA')[0] @@ -368,7 +367,7 @@ def buildCSM(scratchDir): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - csm=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + csm=buildNewsSource2(name, url, h1s, h2s, h3s) badTitleArr=['Change Agent'] badDescArr=None @@ -388,7 +387,7 @@ in The Blaze articles by grabbing the first portion of the story instead def blazeFixDesc(articleArr): TAG_RE = re.compile(r'<[^>]+>') for i in range(len(articleArr)): - desc=urlToContent(articleArr[i].url, scratchDir) + desc=urlToContent(articleArr[i].url) desc=desc.split('
    ', 1)[1] desc=desc.split('

    ', 1)[1] desc=TAG_RE.sub('', desc) @@ -400,12 +399,12 @@ def blazeFixDesc(articleArr): -def buildBlaze(scratchDir): +def buildBlaze(): url='http://theblaze.com' name='The Blaze' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -439,7 +438,7 @@ def buildBlaze(scratchDir): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - blz=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + blz=buildNewsSource2(name, url, h1s, h2s, h3s) badTitleArr=['Tucker Carlson', 'Mark Levin'] badDescArr=['Lawrence Jones', 'Mike Slater'] @@ -459,12 +458,12 @@ def buildBlaze(scratchDir): -def buildCBS(scratchDir): +def buildCBS(): url='http://cbsnews.com' name='CBS News' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -508,7 +507,7 @@ def buildCBS(scratchDir): h3s.append(url+x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - cbs=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + cbs=buildNewsSource2(name, url, h1s, h2s, h3s) cbs=removeBadStories(cbs, ['60 Minutes'], ['60 Minutes'], None, None, ['whats-in-the-news-coverart']) return cbs @@ -517,12 +516,12 @@ def buildCBS(scratchDir): -def buildNBC(scratchDir): +def buildNBC(): url='http://nbcnews.com' name='NBC News' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -571,7 +570,7 @@ def buildNBC(scratchDir): ''' h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - nbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + nbc=buildNewsSource2(name, url, h1s, h2s, h3s) nbc=removeBadStories(nbc, None, ['First Read'], None, None, None) @@ -580,12 +579,12 @@ def buildNBC(scratchDir): -def buildBBC(scratchDir): +def buildBBC(): url='http://www.bbc.com/news/world/us_and_canada' name='BBC US & Canada' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -619,7 +618,7 @@ def buildBBC(scratchDir): h3s.append('http://www.bbc.com'+x) h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - bbc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + bbc=buildNewsSource2(name, url, h1s, h2s, h3s) badTitleArr=None badDescArr=None badAuthorArr=None @@ -642,12 +641,12 @@ def buildBBC(scratchDir): -def buildWeeklyStandard(scratchDir): +def buildWeeklyStandard(): url='http://www.weeklystandard.com' name='Weekly Standard' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -692,7 +691,7 @@ def buildWeeklyStandard(scratchDir): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wkl=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + wkl=buildNewsSource2(name, url, h1s, h2s, h3s) #REMOVE BAD STORIES badTitleArr=None @@ -707,12 +706,12 @@ def buildWeeklyStandard(scratchDir): -def buildNPR(scratchDir): +def buildNPR(): url='http://www.npr.org/sections/news/' name='NPR' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -746,7 +745,7 @@ def buildNPR(scratchDir): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - npr=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + npr=buildNewsSource2(name, url, h1s, h2s, h3s) #REMOVE BAD STORIES badTitleArr=['The Two-Way'] @@ -761,12 +760,12 @@ def buildNPR(scratchDir): -def buildABC(scratchDir): +def buildABC(): url='http://www.abcnews.go.com' name='ABC News' #DOWNLOAD HOMEPAGE 
CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -800,7 +799,7 @@ def buildABC(scratchDir): h3s.append(x) h1s, h2s, h3s = removeDuplicates([h1], h2s, h3s) - abc=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + abc=buildNewsSource2(name, url, h1s, h2s, h3s) #REMOVE BAD STORIES badTitleArr=None @@ -815,12 +814,12 @@ def buildABC(scratchDir): -def buildFoxNews(scratchDir): +def buildFoxNews(): url='http://foxnews.com' name='Fox News' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline h1=content @@ -854,7 +853,7 @@ def buildFoxNews(scratchDir): h3s = ['http:' + x if x.startswith('//') else x for x in h3s] h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - fox=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + fox=buildNewsSource2(name, url, h1s, h2s, h3s) #REMOVE BAD STORIES badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker'] @@ -868,12 +867,12 @@ def buildFoxNews(scratchDir): -def buildNYT(scratchDir): +def buildNYT(): url='http://www.nytimes.com' name='New York Times' #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url, scratchDir) + content=urlToContent(url) #get main headline #this will likely need if/else logic @@ -951,7 +950,7 @@ def buildNYT(scratchDir): h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - nyt=buildNewsSource2(name, url, h1s, h2s, h3s, scratchDir) + nyt=buildNewsSource2(name, url, h1s, h2s, h3s) nyt=removeBadStories(nyt, None, None, None, None, ['https://www.nytimes.com/section/magazine', 'https://www.nytimes.com/newsletters/the-interpreter']) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 0181beb..76c80b0 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -5,7 +5,6 @@ import os import pkgutil import random import re -import subprocess import time import urllib.parse @@ -17,7 +16,7 @@ from unbiased.unbiasedObjects import * logger = logging.getLogger('unbiased') #take in a url and delimiters, return twitter card -def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): +def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): debugging=False if debugging: @@ -142,7 +141,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t return None -def buildOutput(newsSourceArr, webroot, scratch): +def buildOutput(newsSourceArr, webroot): #read in the template html file from jinja2 import Environment, PackageLoader, select_autoescape env = Environment( @@ -193,7 +192,7 @@ def buildOutput(newsSourceArr, webroot, scratch): source=newsSourceArr[h1RandomSources[i]] randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] article=source.h1Arr[randomArticle] - img_name = pullImage(article.img, image_index, webroot, scratch, 350, 200) + img_name = pullImage(article.img, image_index, webroot, 350, 200) image_index += 1 article.img = img_name top_stories.append(article) @@ -202,7 +201,7 @@ def buildOutput(newsSourceArr, webroot, scratch): for i in range(len(h2RandomPairs)): pair=h2RandomPairs[i] article=newsSourceArr[pair[0]].h2Arr[pair[1]] - img_name = pullImage(article.img, image_index, webroot, scratch, 150, 100) + img_name = pullImage(article.img, image_index, webroot, 150, 100) image_index += 1 article.img = img_name middle_stories.append(article) @@ -246,7 +245,7 @@ def printOutputHTML(outputHTML, outDir): with open(os.path.join(outDir, 
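
The change running through all of the hunks above is mechanical: every build function drops its scratchDir parameter because homepage HTML is no longer written to a scratch directory before parsing; it is pulled straight into memory. The helper below is a rough sketch of what that fetch step amounts to, assuming a requests-based approach like the one buildArticle uses; fetch_homepage, slice_between, the timeout value, and the h1 delimiters are illustrative names and choices, not the project's actual urlToContent or parser internals.

    import requests

    def fetch_homepage(url):
        # Download the page into memory; no scratch directory involved.
        # Return None on any network or HTTP error so the caller can retry.
        try:
            res = requests.get(url, timeout=30)
        except requests.RequestException:
            return None
        if res.status_code != 200:
            return None
        return res.text

    def slice_between(text, start, end):
        # Crude delimiter-based extraction in the spirit of the parser code.
        # Raises IndexError if the markers are missing.
        return text.split(start, 1)[1].split(end, 1)[0]

    content = fetch_homepage('http://cbsnews.com')
    if content is not None:
        main_headline_block = slice_between(content, '<h1', '</h1>')

The None-on-failure convention is an assumption here; in the real code the retry loop in main.py is what absorbs transient download failures.
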
filename), 'wb') as fp: fp.write(data) -def pullImage(url, index, webroot, scratch, target_width=350, target_height=200): +def pullImage(url, index, webroot, target_width=350, target_height=200): extension = url.split('.')[-1].split('?')[0] img_name = 'img{}.{}'.format(index, extension) res = requests.get(url) -- cgit v1.2.3 From c5a75b89716eabcefd1fe4cb880ffd98669a48a6 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 22:59:21 -0400 Subject: a bit of refactoring --- unbiased/main.py | 55 ++++++++++++++++++++++++++++--------------- unbiased/unbiasedFunctions.py | 39 ++++++++++++------------------ 2 files changed, 51 insertions(+), 43 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index 60211ea..ba72710 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -34,54 +34,71 @@ def main(): time.sleep(sleeptime) def run(webroot): - sourceList=[] + sources = [] ''' - SOURCES TO ADD NEXT: - -ABC -REUTERS -Town Hall - ''' logger.debug('Running with webroot="{}"'.format(webroot)) - ### These values have to be the second half of the function name ### E.g. Guardian calls buildGuardian(), etc. - sourceFnArr=['Guardian', 'TheHill', 'NPR', 'BBC', 'NBC', 'CBS', - 'FoxNews', 'WashTimes', 'CSM', 'ABC'] #'Blaze' + sourceFnArr = [ + 'Guardian', + 'TheHill', + 'NPR', + 'BBC', + 'NBC', + 'CBS', + 'FoxNews', + 'WashTimes', + 'CSM', + 'ABC', + ] for source in sourceFnArr: logger.info('Crawling {}'.format(source)) - tries=0 - while tries<3: + tries = 0 + while tries < 3: time.sleep(tries) try: - fn='build'+source + fn = 'build' + source possibles = globals().copy() possibles.update(locals()) method = possibles.get(fn) - src=method() - sourceList.append(src) + src = method() + sources.append(src) break except Exception as ex: - tries+=1 + tries += 1 if tries == 3: logger.error('Build failed. source={} ex={}'.format(source, ex)) else: logger.debug('Build failed, retrying. 
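
pullImage is only partly visible in this hunk, but its parameters (target_width and target_height), the img{index} naming, and the Image.open and crop-to-aspect-ratio context lines that appear later in this log outline what it does: download each story image and fit it to a fixed slot under the webroot. The sketch below is a guess at that crop-and-scale step using Pillow; the exact geometry and the .png output name are simplifications, not the project's code.

    import io
    import os

    import requests
    from PIL import Image

    def pull_image_sketch(url, index, webroot, target_width=350, target_height=200):
        res = requests.get(url)
        if res.status_code != 200:
            return ''
        img = Image.open(io.BytesIO(res.content))
        # Center-crop to the target aspect ratio, then scale to the slot size.
        target_ratio = target_width / target_height
        w, h = img.size
        if w / h > target_ratio:
            new_w = int(h * target_ratio)
            box = ((w - new_w) // 2, 0, (w + new_w) // 2, h)
        else:
            new_h = int(w / target_ratio)
            box = (0, (h - new_h) // 2, w, (h + new_h) // 2)
        img = img.crop(box).resize((target_width, target_height))
        # The real function keeps the source file's extension; .png keeps the sketch simple.
        img_name = 'img{}.png'.format(index)
        img.save(os.path.join(webroot, img_name))
        return img_name
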
source={} ex={}'.format(source, ex)) - - #scrape all urls and build data structure - newsSourceArr = sourceList + logger.info('Parsed home pages for: {}'.format([x.name for x in sources])) + + top_stories, middle_stories, bottom_stories = pickStories(sources) + logger.info('Picked top stories from: {}'.format([x.source for x in top_stories])) + logger.info('Picked middle stories from: {}'.format([x.source for x in middle_stories])) + logger.info('Picked bottom stories from: {}'.format([x.source for x in bottom_stories])) + + # download images + img_idx = 0 + for story in top_stories: + story.img = pullImage(story.img, img_idx, webroot, 350, 200) + img_idx += 1 + for story in middle_stories: + story.img = pullImage(story.img, img_idx, webroot, 150, 100) + img_idx += 1 #build the output file HTML - outputHTML=buildOutput(newsSourceArr, webroot) + outputHTML = buildOutput(top_stories, middle_stories, bottom_stories) #print the output file HTML - printOutputHTML(outputHTML, webroot) - + writeOutputHTML(outputHTML, webroot) if __name__=="__main__": main() diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 76c80b0..2053ba5 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -141,15 +141,7 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, return None -def buildOutput(newsSourceArr, webroot): - #read in the template html file - from jinja2 import Environment, PackageLoader, select_autoescape - env = Environment( - loader=PackageLoader('unbiased', 'html_template'), - autoescape=select_autoescape(['html', 'xml']) - ) - template = env.get_template('unbiased.jinja.html') - +def pickStories(newsSourceArr): #set the random order for sources h1RandomSources=[] while len(h1RandomSources)<4: @@ -192,18 +184,12 @@ def buildOutput(newsSourceArr, webroot): source=newsSourceArr[h1RandomSources[i]] randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] article=source.h1Arr[randomArticle] - img_name = pullImage(article.img, image_index, webroot, 350, 200) - image_index += 1 - article.img = img_name top_stories.append(article) middle_stories = [] for i in range(len(h2RandomPairs)): pair=h2RandomPairs[i] article=newsSourceArr[pair[0]].h2Arr[pair[1]] - img_name = pullImage(article.img, image_index, webroot, 150, 100) - image_index += 1 - article.img = img_name middle_stories.append(article) bottom_stories = [] @@ -212,14 +198,21 @@ def buildOutput(newsSourceArr, webroot): article=newsSourceArr[pair[0]].h3Arr[pair[1]] bottom_stories.append(article) - sourcesStr='' - for i in range(len(newsSourceArr)-1): - sourcesStr+=newsSourceArr[i].name+', ' - sourcesStr+=newsSourceArr[-1].name - logger.info('Successfully parsed: '+sourcesStr) + return top_stories, middle_stories, bottom_stories + +def buildOutput(top_stories, middle_stories, bottom_stories): + #read in the template html file + from jinja2 import Environment, PackageLoader, select_autoescape + env = Environment( + loader=PackageLoader('unbiased', 'html_template'), + autoescape=select_autoescape(['html', 'xml']) + ) + template = env.get_template('unbiased.jinja.html') timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories])) + html = template.render( timestamp = timestamp, top_stories = top_stories, @@ -228,13 +221,11 @@ def buildOutput(newsSourceArr, webroot): sources = sourcesStr, ) - #return updated text return html -def 
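
The refactor in this commit pulls apart three jobs that buildOutput used to do in one pass: picking which articles appear in each tier, fetching and resizing their images, and rendering HTML. Pieced together from the hunks above and just below, run() now reduces to roughly the following; the function names are the project's own, but the glue code here is paraphrased rather than copied.

    from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, writeOutputHTML

    def run_sketch(sources, webroot):
        # 1. choose the top / middle / bottom articles across all sources
        top, middle, bottom = pickStories(sources)

        # 2. download and resize images only for the tiers that display them
        img_idx = 0
        for story in top:
            story.img = pullImage(story.img, img_idx, webroot, 350, 200)
            img_idx += 1
        for story in middle:
            story.img = pullImage(story.img, img_idx, webroot, 150, 100)
            img_idx += 1

        # 3. render the Jinja2 template and write index.html under the webroot
        outputHTML = buildOutput(top, middle, bottom)
        writeOutputHTML(outputHTML, webroot)

Keeping image downloads in main.py means buildOutput becomes a pure render step, which is what makes the later template changes easy to test in isolation.
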
printOutputHTML(outputHTML, outDir): - timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) - outputHTML=outputHTML.replace('xxTimexx', timestamp) +def writeOutputHTML(outputHTML, outDir): + timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) with open(os.path.join(outDir, 'index.html'), 'w') as fp: fp.write(outputHTML) -- cgit v1.2.3 From f3d9287481b0ebba2b6dcb687e461dbc79074ad1 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 22:59:37 -0400 Subject: tweaking top story responsiveness --- unbiased/html_template/unbiased.css | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 1424ee9..60932d8 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -115,10 +115,12 @@ a:hover{ @media only screen and (max-width:500px){ .top-stories-img{ - width:auto; + width:inherit; + height:inherit; + max-width:350px; + padding-top:57.14%; } } - .top-stories-hed{ font-weight:bold; -- cgit v1.2.3 From 9a8eff98fc5dec755683ce1708bf0caf578c5752 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 23:31:37 -0400 Subject: tweaking responsive css --- unbiased/html_template/unbiased.css | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 60932d8..24b1959 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -174,20 +174,18 @@ a:hover{ background-size: 100%; background-position: center center; float:left; - max-width:35%; } .middle-stories-hed{ font-size:1.2em; - float:left; - width:300px; margin-left:10px; color:#00f; + padding-left:150px; } @media only screen and (max-width:500px){ .middle-stories-hed{ - max-width:60%; + max-width:60%; } } -- cgit v1.2.3 From deca37e1ae9da82bfd4ef5edf95fd9c22b871cd0 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Thu, 20 Apr 2017 13:38:04 -0400 Subject: switch to dict logging and configure separate log writers for console and debug --- unbiased/main.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index ba72710..df2b209 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -2,19 +2,48 @@ import argparse import logging +import logging.config import time from unbiased.unbiasedObjects import * from unbiased.unbiasedFunctions import * from unbiased.parser import * +logging.config.dictConfig({ + 'version': 1, + 'formatters': { + 'console': { + 'format': '%(levelname)s %(filename)s:%(lineno)d %(message)s', + }, + 'file': { + 'format': '%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s', + }, + }, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler', + 'level': 'INFO', + 'formatter': 'console', + }, + 'file': { + 'class': 'logging.handlers.RotatingFileHandler', + 'level': 'DEBUG', + 'formatter': 'file', + 'filename': '/opt/unbiased/logs/unbiased.debug.log', + 'maxBytes': 1024 * 1024, + 'backupCount': 3, + }, + }, + 'loggers': { + 'unbiased': { + 'handlers': ['console', 'file'], + }, + }, + 'root': { + 'level': 'DEBUG', + } +}) logger = logging.getLogger('unbiased') -logger.setLevel(logging.DEBUG) -ch = logging.StreamHandler() -ch.setLevel(logging.DEBUG) -ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) -logger.addHandler(ch) - def main(): parser = argparse.ArgumentParser() -- cgit v1.2.3 From 
b936c9b7385bd4330c7f9fda3775f9dc1483a328 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Thu, 20 Apr 2017 13:48:17 -0400 Subject: read log dir from command line and quiet down console logging --- unbiased/main.py | 15 ++++++++++----- unbiased/unbiasedFunctions.py | 8 ++++---- unbiased/unbiasedObjects.py | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index df2b209..f81321e 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -9,7 +9,9 @@ from unbiased.unbiasedObjects import * from unbiased.unbiasedFunctions import * from unbiased.parser import * -logging.config.dictConfig({ +logger = logging.getLogger('unbiased') + +logging_config = { 'version': 1, 'formatters': { 'console': { @@ -29,7 +31,7 @@ logging.config.dictConfig({ 'class': 'logging.handlers.RotatingFileHandler', 'level': 'DEBUG', 'formatter': 'file', - 'filename': '/opt/unbiased/logs/unbiased.debug.log', + 'filename': '', 'maxBytes': 1024 * 1024, 'backupCount': 3, }, @@ -42,14 +44,17 @@ logging.config.dictConfig({ 'root': { 'level': 'DEBUG', } -}) -logger = logging.getLogger('unbiased') +} def main(): parser = argparse.ArgumentParser() - parser.add_argument('-w', '--webroot', default='/var/www/ubiased', help='location to write the output html') + parser.add_argument('-w', '--webroot', help='location of config file') + parser.add_argument('-l', '--log-dir', help='location to write logs') args = parser.parse_args() + logging_config['handlers']['file']['filename'] = os.path.join(args.log_dir, 'unbiased.debug.log') + logging.config.dictConfig(logging_config) + crawl_frequency = 600 while True: logger.info('Starting crawl') diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 2053ba5..46dae19 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -30,7 +30,7 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, try: res = requests.get(url) except Exception as ex: - logger.error("""ARTICLE DOWNLOADING ERROR + logger.debug("""ARTICLE DOWNLOADING ERROR SOURCE:\t{} URL:\t{}""".format(sourceName, url)) return None @@ -38,7 +38,7 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, if res.status_code == 200: content = res.text else: - logger.error("""ARTICLE DOWNLOADING ERROR + logger.debug("""ARTICLE DOWNLOADING ERROR SOURCE:\t{} URL:\t{}""".format(sourceName, url)) return None @@ -135,7 +135,7 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, return a except Exception: - logger.error("""ARTICLE PARSING ERROR + logger.debug("""ARTICLE PARSING ERROR SOURCE:\t{} URL:\t{}""".format(sourceName, url)) return None @@ -243,7 +243,7 @@ def pullImage(url, index, webroot, target_width=350, target_height=200): if res.status_code == 200: content = res.content else: - logger.error('Image not found: url={}'.format(url)) + logger.debug('Image not found: url={}'.format(url)) return '' img = Image.open(io.BytesIO(content)) # crop to aspect ratio diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py index 9372d3a..7908fbb 100644 --- a/unbiased/unbiasedObjects.py +++ b/unbiased/unbiasedObjects.py @@ -90,5 +90,5 @@ class NewsSource(): elif level==3: self.h3Arr.append(article) else: - logger.error("Invalid level in NewsSource.addArtlce: " + level) + logger.debug("Invalid level in NewsSource.addArtlce: " + level) -- cgit v1.2.3 From 45926db9caed33062ab491df63f33ee3b3f5c468 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Fri, 21 Apr 
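
Taken together, the logging commits above replace the hand-built StreamHandler with logging.config.dictConfig: a console handler pinned at INFO plus a rotating DEBUG file that is only attached when a log directory is passed on the command line (a guard that a later commit in this log tightens further). Condensed and lightly merged across those commits, the moving parts look roughly like this; handler names, sizes, and the -l/--log-dir flag mirror the diffs, while the help strings and the surrounding main() are trimmed.

    import argparse
    import logging
    import logging.config
    import os

    logging_config = {
        'version': 1,
        'formatters': {
            'console': {'format': '%(levelname)s %(filename)s:%(lineno)d %(message)s'},
            'file': {'format': '%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s'},
        },
        'handlers': {
            'console': {'class': 'logging.StreamHandler', 'level': 'INFO', 'formatter': 'console'},
            'file': {
                'class': 'logging.handlers.RotatingFileHandler',
                'level': 'DEBUG',
                'formatter': 'file',
                'filename': '',  # filled in from --log-dir before dictConfig runs
                'maxBytes': 1024 * 1024,
                'backupCount': 3,
            },
        },
        'loggers': {'unbiased': {'handlers': ['console', 'file']}},
        'root': {'level': 'DEBUG'},
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--webroot', help='location to write the output html')
    parser.add_argument('-l', '--log-dir', help='location to write detailed logs')
    args = parser.parse_args()

    if args.log_dir:
        logging_config['handlers']['file']['filename'] = os.path.join(args.log_dir, 'unbiased.debug.log')
    else:
        # no log dir given: drop the file handler so dictConfig does not try to open ''
        logging_config['loggers']['unbiased']['handlers'].remove('file')
        del logging_config['handlers']['file']

    logging.config.dictConfig(logging_config)
    logger = logging.getLogger('unbiased')

Because the root level stays at DEBUG and the handlers do the filtering, the per-article errors demoted to logger.debug() in the same commit still reach the rotating file while the console stays quiet.
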
2017 22:33:40 -0400 Subject: rewrite css to use flexbox for responsive display --- unbiased/html_template/unbiased.css | 276 ++++++++++++----------------- unbiased/html_template/unbiased.jinja.html | 83 +++++---- 2 files changed, 156 insertions(+), 203 deletions(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 24b1959..caf4691 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -1,220 +1,166 @@ -/*body{ - width:900px; - margin-left:auto; - margin-right:auto; -}*/ +body { + margin: 0; +} +a:link, a:visited, a:hover, a:active, a { + color: #00f; + text-decoration:none; +} -body{ - margin:0; +a:hover { + cursor:pointer; } -a:link, a:visited, a:hover, a:active { - color: #00f; - text-decoration:none; - } +hr { + max-width: 890px; + margin: 5px auto; + border: 0; + height: 3px; + background-color: #BB133E; +} -a:hover{ - cursor:pointer; +#page-header { + width: 100%; + text-align: center; + padding: .5em 0 1em; + margin-bottom: 1em; + border-bottom: 3px solid #BB133E; + background: #002147; } -#page-header{ - width:100%; - text-align:center; - padding:.5em 0 1em; - margin-bottom:1em; - border-bottom:3px solid #BB133E; - background:#002147; +.title { + font-size: 3em; } -.title{ - font-size:3em; +#title-1 { + font-style: italic; + color: #fff; } -#title-1{ - font-style:italic; - color:#fff; +#title-2 { + color: #fff; } -#title-2{ - color:#fff; +#subtitle { + font-size: 1.25em; + color: #ccc; } -#subtitle{ - font-size:1.25em; - color:#ccc; +#timestamp { + margin: .5em 0 0 0; + font-size: .8em; + color: #cc6; } -#timestamp{ - margin:.5em 0 0 0; - font-size:.8em; - color:#cc6; +#top-stories { + max-width: 900px; + display: flex; + flex-wrap: wrap; + margin: 5px auto; } -#page-container{ - width:900px; - margin-left:auto; - margin-right:auto; +.top-story { + flex: 1 0 350px; + margin: 5px; + padding: 10px; + border:2px solid #eee; } -@media only screen and (max-width:900px){ - #page-container{ - width:100% - } +.top-stories-img { + width: 100%; + padding-bottom: 57%; + background-size: 100%; + background-position: center center; } -#top-stories{ - width:95%; - display:block; - overflow:auto; - padding:10px; - margin-left:auto; - margin-right:auto; - text-align:center; - border-bottom: 3px solid #BB133E; - margin-bottom: 10px; -} - -.row{ - display:flex; -} - -.top-story{ - display:inline-block; - vertical-align:top; - text-align:left; - width:360px; - height:350px; - overflow:hidden; - background:#fff; - margin:10px; - padding:10px; - border:2px solid #ccc; - flex:1; -} - -@media only screen and (max-width:500px){ - .row{ - display:block; - } - .top-story{ - display:block; - width:auto; - height:auto; - } -} - -.top-stories-img{ - width:350px; - height:200px; - overflow:hidden; - background-size: 100%; - background-position: center center; - margin:0 auto; +.top-stories-hed { + font-size: 1.3em; + margin: 10px 0; + color: #00f; } -@media only screen and (max-width:500px){ - .top-stories-img{ - width:inherit; - height:inherit; - max-width:350px; - padding-top:57.14%; - } +.top-stories-desc { + font-size: 1em; } -.top-stories-hed{ - font-weight:bold; - font-size:1.35em; - margin:10px 10px 0; - color:#00f; +.c2 { + max-width: 900px; + display: flex; + flex-wrap: wrap; + margin: 5px auto; } -.top-stories-desc{ - font-size:1em; - padding-top:.5em; - margin:0 .75em; +.c2 hr { + display: none; } -#middle-stories{ - clear:both; - width:500px; - margin:0 auto; - padding:0; - display:block; - overflow:auto; - float:left; 
+#middle-stories { + flex: 7 0 200px; } -@media only screen and (max-width:500px){ - #middle-stories{ - width:100%; - float:none; - } +.middle-story { + margin: 5px; + border: 2px solid #eee; } -.middle-story{ - margin:5px 10px; - padding:10px; - background:#fff; - border:2px solid #ddd; - width:460px; - float:left; +.middle-story a { + padding: 10px; + display: inline-block; } -@media only screen and (max-width:500px){ - .middle-story{ - width:auto; - } +.middle-story a p { + margin: 0; } .middle-stories-img{ - width:150px; - height:100px; - overflow:hidden; + width: 150px; + height: 100px; background-size: 100%; background-position: center center; - float:left; + float: left; + margin-right: 10px; } -.middle-stories-hed{ - font-size:1.2em; - margin-left:10px; - color:#00f; - padding-left:150px; +#middle-stories a { + font-size: 1.1em; + color: #00f; } -@media only screen and (max-width:500px){ - .middle-stories-hed{ - max-width:60%; - } +#bottom-stories { + flex: 3 0 200px; + border: 2px solid #eee; + margin: 5px; } -#bottom-stories{ - margin:0 10px; - padding:10px; - display:block; - overflow:auto; - float:left; - width:350px; - border:5px solid #ddd; +.bottom-story { + padding: 10px; + color: #00f; } -@media only screen and (max-width:900px){ - #bottom-stories{ - width:auto; - border-width:3px; - float:none; - } +#sources { + margin: 2em 5px 0 5px; + font-size: .8em; } -.bottom-story{ color:#00f; - - padding:15px 0; - color:#00f; +@media (max-width: 900px) { + hr { + width: inherit; + margin: 5px; + } } -#sources{ - clear:both; - padding-top:4em; - font-size:.8em; +@media (max-width: 767px) { + .top-stories { + flex-wrap: nowrap; + flex-direction: column; + } + .top-story { + flex: 1 0 250px; + } + .c2 { + flex-direction: column; + } + .c2 hr { + display: inherit; + } } diff --git a/unbiased/html_template/unbiased.jinja.html b/unbiased/html_template/unbiased.jinja.html index 40c9582..fcca97f 100644 --- a/unbiased/html_template/unbiased.jinja.html +++ b/unbiased/html_template/unbiased.jinja.html @@ -9,64 +9,71 @@ UnBiased - -

    + -
    + + +
    + + {% for story in top_stories %} -
    +
    + +
    +
    {{ story.title|safe }}
    +
    +
    {{ story.description|safe|truncate(140) }}
    +
    - {% for story in top_stories %} + {% endfor %} -
    - -
    -
    {{ story.title|safe }}
    -
    -
    {{ story.description|safe|truncate(140) }}
    - {% endfor %} +
    -
    +
    -
    +
    - {% for story in middle_stories %} + {% for story in middle_stories %} - -
    -
    +
    + +

    +

    + {{ story.title|safe }} +

    +
    -
    {{ story.title|safe }}
    + + {% endfor %} +
    - - {% endfor %} +
    -
    +
    -
    + {% for story in bottom_stories %} - {% for story in bottom_stories %} +
    + {{ story.title|safe }} +
    -
    - {{ story.title|safe }} -
    + {% endfor %} - {% endfor %} +
    -
    +
    -
    +
    + Sources: {{ sources }} +
    -
    - Sources: {{ sources }} -
    - + -- cgit v1.2.3 From 1cbd15b3f35e162a21b2dc2ac784b9acf71b6c3d Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Fri, 21 Apr 2017 22:40:34 -0400 Subject: include favicons in the distribution --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 57c27c0..2761041 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,14 @@ from setuptools import setup setup( name="unbiased", - version="0", + version="1", packages=['unbiased'], package_data={ 'unbiased': [ 'html_template/*.html', 'html_template/*.css', + 'html_template/*.ico', + 'html_template/*.png', ], }, install_requires=[ -- cgit v1.2.3 From 5cbce38c92953d24b48f714b1fc33d5cafdf874a Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Fri, 21 Apr 2017 23:38:55 -0400 Subject: fix safari rendering bug --- unbiased/html_template/unbiased.css | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index caf4691..5995dfd 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -150,7 +150,7 @@ hr { } @media (max-width: 767px) { - .top-stories { + #top-stories { flex-wrap: nowrap; flex-direction: column; } @@ -158,7 +158,7 @@ hr { flex: 1 0 250px; } .c2 { - flex-direction: column; + display: inherit; } .c2 hr { display: inherit; -- cgit v1.2.3 From 020c9908def3a816e05984c3ee55457fc423a931 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 22 Apr 2017 11:11:34 -0400 Subject: update command line arguments --- unbiased/main.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/unbiased/main.py b/unbiased/main.py index f81321e..caf77eb 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -48,11 +48,18 @@ logging_config = { def main(): parser = argparse.ArgumentParser() - parser.add_argument('-w', '--webroot', help='location of config file') - parser.add_argument('-l', '--log-dir', help='location to write logs') + parser.add_argument('webroot', help='location to write html output') + parser.add_argument('-l', '--log-dir', help='location to write detailed logs') + parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode') args = parser.parse_args() - logging_config['handlers']['file']['filename'] = os.path.join(args.log_dir, 'unbiased.debug.log') + if args.log_dir: + logging_config['handlers']['file']['filename'] = os.path.join(args.log_dir, 'unbiased.debug.log') + else: + logging_config['loggers']['unbiased']['handlers'].remove('file') + del logging_config['handlers']['file'] + if args.debug: + logging_config['handlers']['console']['level'] = 'DEBUG' logging.config.dictConfig(logging_config) crawl_frequency = 600 -- cgit v1.2.3 From 76336db3237e122515f0ecec8c6a6c86790117c2 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 22 Apr 2017 11:12:13 -0400 Subject: systemd daemon service config --- etc/unbiased.service | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 etc/unbiased.service diff --git a/etc/unbiased.service b/etc/unbiased.service new file mode 100644 index 0000000..391e4ff --- /dev/null +++ b/etc/unbiased.service @@ -0,0 +1,12 @@ +[Unit] +Description=Unbiased News + +[Service] +Type=simple +ExecStart=/opt/unbiased/venv/bin/unbiased /opt/unbiased/webroot -l /opt/unbiased/logs +WorkingDirectory=/opt/unbiased +User=www-data +Restart=on-failure + +[Install] +WantedBy=multi-user.target -- cgit v1.2.3 From 218d3d2e6336f3ccf5e8329e2f0bb15efc7df94a Mon Sep 17 00:00:00 2001 From: Matt 
Singleton Date: Sat, 22 Apr 2017 11:12:42 -0400 Subject: gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 238da47..9e0f924 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.pyc *~ +.DS_Store __pycache__/ unbiased/scratch/*.html legacy_py/ -- cgit v1.2.3 From 53ccd0a1b21963021f2281c5df1a557f95514225 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 22 Apr 2017 11:16:39 -0400 Subject: remove old templates --- unbiased/html_template/newtemplate.html | 150 --------------------------- unbiased/html_template/template.html | 173 -------------------------------- 2 files changed, 323 deletions(-) delete mode 100644 unbiased/html_template/newtemplate.html delete mode 100755 unbiased/html_template/template.html diff --git a/unbiased/html_template/newtemplate.html b/unbiased/html_template/newtemplate.html deleted file mode 100644 index 0cec766..0000000 --- a/unbiased/html_template/newtemplate.html +++ /dev/null @@ -1,150 +0,0 @@ - - - - - - UnBiased - - - - - - - -
    - Sources: BBC US, NBC News, CBS News, The Blaze, Weekly Standard, New York Times, Fox News -
    - - diff --git a/unbiased/html_template/template.html b/unbiased/html_template/template.html deleted file mode 100755 index fc17006..0000000 --- a/unbiased/html_template/template.html +++ /dev/null @@ -1,173 +0,0 @@ - - - - - - - UnBiased - - - - - - - -
    - Sources: xxSourcesxx -
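
With newtemplate.html and template.html deleted, the only rendering path left is the Jinja2 template packaged under unbiased/html_template. For reference, the render-and-write sequence assembled from the buildOutput and writeOutputHTML hunks earlier in this log looks roughly like this, condensed; the cache-busting utime value and the error handling that later commits add are left out.

    import os
    import time

    from jinja2 import Environment, PackageLoader, select_autoescape

    def render_page(top_stories, middle_stories, bottom_stories, webroot):
        env = Environment(
            loader=PackageLoader('unbiased', 'html_template'),
            autoescape=select_autoescape(['html', 'xml']),
        )
        template = env.get_template('unbiased.jinja.html')
        html = template.render(
            timestamp=time.strftime('%a, %b %-d, %-I:%M%P %Z', time.localtime()),
            top_stories=top_stories,
            middle_stories=middle_stories,
            bottom_stories=bottom_stories,
            sources=', '.join({s.source for s in top_stories + middle_stories + bottom_stories}),
        )
        with open(os.path.join(webroot, 'index.html'), 'w') as fp:
            fp.write(html)

Shipping the template, CSS, and favicons as package data (see the setup.py change a few commits down) is what lets PackageLoader find these files from an installed copy rather than a source checkout.
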
    - - -- cgit v1.2.3 From 1c825c79a17f9ba1e7c81668921c87ccecd672d3 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sun, 23 Apr 2017 05:33:23 -0400 Subject: tweak the rendering --- unbiased/html_template/unbiased.jinja.html | 12 ++++++------ unbiased/unbiasedFunctions.py | 6 ++++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/unbiased/html_template/unbiased.jinja.html b/unbiased/html_template/unbiased.jinja.html index fcca97f..4a07d0b 100644 --- a/unbiased/html_template/unbiased.jinja.html +++ b/unbiased/html_template/unbiased.jinja.html @@ -24,10 +24,10 @@
    -
    -
    {{ story.title|safe }}
    +
    +
    {{ story.title }}
    -
    {{ story.description|safe|truncate(140) }}
    +
    {{ story.description }}
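
Dropping the |safe filters here pairs with the html.unescape() calls added to buildArticle in the same commit, visible further down: article fields are stored as plain text, and Jinja2's autoescaping then encodes them exactly once at render time instead of trusting scraped strings to be markup-safe. A standalone illustration of why both halves are needed; this is not project code, and the sample title is invented.

    import html
    from jinja2 import Environment

    # og:title values scraped from article pages often arrive entity-encoded.
    scraped_title = 'Senate &amp; House spar over &quot;budget&quot; deal'

    # Store the decoded, plain-text form on the Article...
    title = html.unescape(scraped_title)   # Senate & House spar over "budget" deal

    # ...and let autoescaping re-encode it exactly once at render time.
    env = Environment(autoescape=True)
    template = env.from_string('<div class="top-stories-hed">{{ title }}</div>')
    print(template.render(title=title))
    # <div class="top-stories-hed">Senate &amp; House spar over &#34;budget&#34; deal</div>
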
    {% endfor %} @@ -45,8 +45,8 @@

    -

    - {{ story.title|safe }} +
    + {{ story.title }}

    @@ -62,7 +62,7 @@ {% for story in bottom_stories %}
    - {{ story.title|safe }} + {{ story.title }}
    {% endfor %} diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 46dae19..cb13a44 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -131,7 +131,7 @@ def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, logger.debug(description) - a=Article(title, url, img, description, sourceName, author) + a=Article(html.unescape(title), url, img, html.unescape(description), sourceName, html.unescape(author)) return a except Exception: @@ -209,12 +209,14 @@ def buildOutput(top_stories, middle_stories, bottom_stories): ) template = env.get_template('unbiased.jinja.html') - timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + utime = int(time.time()) sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories])) html = template.render( timestamp = timestamp, + utime = utime, top_stories = top_stories, middle_stories = middle_stories, bottom_stories = bottom_stories, -- cgit v1.2.3 From 91851fc975169fe63d7e646d03cd1e7ad57553e5 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Fri, 2 Jun 2017 11:28:21 -0400 Subject: too many opinions buffalo --- unbiased/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unbiased/parser.py b/unbiased/parser.py index 41727f5..05a7fc1 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -858,7 +858,7 @@ def buildFoxNews(): #REMOVE BAD STORIES badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker'] badDescArr=['Sean Hannity'] - badAuthorArr=['Bill O\'Reilly', 'Sean Hannity'] + badAuthorArr=['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz'] badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg'] badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com'] fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr) -- cgit v1.2.3 From d64c47dbe07f944703c01179ccba57a8e6bfb523 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Fri, 19 May 2017 18:57:35 -0400 Subject: fix padding between sections --- unbiased/html_template/unbiased.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 5995dfd..a68a4c2 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -158,7 +158,7 @@ hr { flex: 1 0 250px; } .c2 { - display: inherit; + flex-direction: column; } .c2 hr { display: inherit; -- cgit v1.2.3 From a82318fbdfc1af624fd9bf9bbae316ad45f43611 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Fri, 2 Jun 2017 11:30:14 -0400 Subject: tweaks for ios --- unbiased/html_template/unbiased.css | 6 ++++++ unbiased/html_template/unbiased.jinja.html | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index a68a4c2..dc99ab7 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -163,4 +163,10 @@ hr { .c2 hr { display: inherit; } + #middle-stories { + flex: inherit; + } + #bottom-stories { + flex: inherit; + } } diff --git a/unbiased/html_template/unbiased.jinja.html b/unbiased/html_template/unbiased.jinja.html index 4a07d0b..0d191e7 100644 --- a/unbiased/html_template/unbiased.jinja.html +++ b/unbiased/html_template/unbiased.jinja.html @@ -1,7 +1,7 @@ - + -- cgit v1.2.3
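
One closing note on the 'too many opinions buffalo' change: the badTitleArr, badDescArr, badAuthorArr, badImgArr, and badURLArr lists handed to removeBadStories act as per-source blocklists, so adding 'Howard Kurtz' to the Fox News author list drops his columns before stories are picked. removeBadStories itself is not shown in this log; the sketch below assumes plain substring matching, which entries like 'Tucker' and 'videos.foxnews.com' suggest, and it uses Article attribute names that appear elsewhere in this log (title, description, img, url), with author assumed.

    def remove_bad_stories_sketch(articles, bad_titles=None, bad_descs=None,
                                  bad_authors=None, bad_imgs=None, bad_urls=None):
        # Illustrative only: the real removeBadStories takes a NewsSource and
        # filters its h1/h2/h3 lists; this operates on a flat list of articles.
        def is_bad(article):
            checks = [
                (bad_titles, article.title),
                (bad_descs, article.description),
                (bad_authors, article.author),   # author attribute assumed
                (bad_imgs, article.img),
                (bad_urls, article.url),
            ]
            return any(
                needles is not None and any(n in (field or '') for n in needles)
                for needles, field in checks
            )
        return [a for a in articles if not is_bad(a)]
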