UnBiased

From 5b0c9c5daa36878513bcc5edbe87a5fe52fdbb82 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 00:34:26 -0400 Subject: get it to run from the package --- setup.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 setup.py (limited to 'setup.py') diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8b73e6d --- /dev/null +++ b/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup + +setup( + name="unbiased", + version="0", + packages=['unbiased'], + package_data={ + 'unbiased': [ + 'html_template/*.html', + 'html_template/*.css', + ], + }, + install_requires=[ + ], + entry_points={ + 'console_scripts': [ + 'unbiased = unbiased.main:main', + ], + }, +) -- cgit v1.2.3 From fd5227f122adf65b8f5340751e037fce67e4d2c4 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 15:52:21 -0400 Subject: use jinja templates to build the output --- setup.py | 1 + unbiased/html_template/unbiased.jinja.html | 69 ++++++++++++++++++++++++++++++ unbiased/unbiasedFunctions.py | 52 +++++++++++----------- 3 files changed, 98 insertions(+), 24 deletions(-) create mode 100644 unbiased/html_template/unbiased.jinja.html (limited to 'setup.py') diff --git a/setup.py b/setup.py index 8b73e6d..0b43b93 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ setup( ], }, install_requires=[ + 'jinja2', ], entry_points={ 'console_scripts': [ diff --git a/unbiased/html_template/unbiased.jinja.html b/unbiased/html_template/unbiased.jinja.html new file mode 100644 index 0000000..297c4c4 --- /dev/null +++ b/unbiased/html_template/unbiased.jinja.html @@ -0,0 +1,69 @@ + + + + + + + UnBiased + + + + + +

+ +

+ + {% for story in top_stories %} + +

+ +

+ + {% endfor %} + +

+ +

+ + {% for story in middle_stories %} + + +

+ + + {% endfor %} + +

+ +

+ + {% for story in bottom_stories %} + +

+ {{ story.title }} +

+ + {% endfor %} + +

+ +

+ Sources: {{ sources }} +

+ + diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 6210ba8..192de8c 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -127,9 +127,13 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t def buildOutput(newsSourceArr): #read in the template html file - template=pkgutil.get_data('unbiased', 'html_template/template.html') - template = template.decode('utf8') - + from jinja2 import Environment, PackageLoader, select_autoescape + env = Environment( + loader=PackageLoader('unbiased', 'html_template'), + autoescape=select_autoescape(['html', 'xml']) + ) + template = env.get_template('unbiased.jinja.html') + #set the random order for sources h1RandomSources=[] while len(h1RandomSources)<4: @@ -139,9 +143,9 @@ def buildOutput(newsSourceArr): h1RandomSources.append(x) else: print('\n\n@@@@\nNo H1 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - + #For h2s and h3s, select N random sources (can repeat), then - #a non-repetitive random article from within + #a non-repetitive random article from within h2RandomPairs=[] while len(h2RandomPairs) < 6: x=random.sample(range(len(newsSourceArr)), 1)[0] @@ -165,34 +169,25 @@ def buildOutput(newsSourceArr): else: print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') - #replace html template locations with data from newsSourceArr + # collect articles for each section + top_stories = [] for i in range(len(h1RandomSources)): source=newsSourceArr[h1RandomSources[i]] randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] article=source.h1Arr[randomArticle] - template=template.replace('xxURL1-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle1-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg1-'+str(i+1)+'xx', article.img) - desc=article.description - if len(desc)>144: - desc=desc[:141] - desc=desc.split()[:-1] - desc=' '.join(desc)+' ...' - template=template.replace('xxDesc1-'+str(i+1)+'xx', desc) + top_stories.append(article) + middle_stories = [] for i in range(len(h2RandomPairs)): pair=h2RandomPairs[i] article=newsSourceArr[pair[0]].h2Arr[pair[1]] - template=template.replace('xxURL2-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle2-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg2-'+str(i+1)+'xx', article.img) + middle_stories.append(article) + bottom_stories = [] for i in range(len(h3RandomPairs)): pair=h3RandomPairs[i] article=newsSourceArr[pair[0]].h3Arr[pair[1]] - template=template.replace('xxURL3-'+str(i+1)+'xx', article.url) - template=template.replace('xxTitle3-'+str(i+1)+'xx', article.title) - template=template.replace('xxImg3-'+str(i+1)+'xx', article.img) + bottom_stories.append(article) sourcesStr='' @@ -200,11 +195,20 @@ def buildOutput(newsSourceArr): sourcesStr+=newsSourceArr[i].name+', ' sourcesStr+=newsSourceArr[-1].name print('Successfully parsed: '+sourcesStr) - template=template.replace('xxSourcesxx', sourcesStr) - + + timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) + + html = template.render( + timestamp = timestamp, + top_stories = top_stories, + middle_stories = middle_stories, + bottom_stories = bottom_stories, + sources = sourcesStr, + ) + #return updated text - return template + return html def printOutputHTML(outputHTML, outDir): timestamp=time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) -- cgit v1.2.3 From 8bce5c2280441760db850d92d651d2fb0f181c50 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Mon, 17 Apr 2017 21:53:42 -0400 Subject: pull the images locally and resize --- setup.py | 1 + unbiased/main.py | 2 +- unbiased/unbiasedFunctions.py | 27 +++++++++++++++++++++++++-- 3 files changed, 27 insertions(+), 3 deletions(-) (limited to 'setup.py') diff --git a/setup.py b/setup.py index 0b43b93..2755304 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ setup( }, install_requires=[ 'jinja2', + 'Pillow', ], entry_points={ 'console_scripts': [ diff --git a/unbiased/main.py b/unbiased/main.py index 159a98b..88ceb7e 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -62,7 +62,7 @@ def run(webroot, scratch): newsSourceArr=buildNewsSourceArr(sourceList, scratch) #build the output file HTML - outputHTML=buildOutput(newsSourceArr) + outputHTML=buildOutput(newsSourceArr, webroot) #print the output file HTML printOutputHTML(outputHTML, webroot) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 192de8c..16ea07d 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -7,6 +7,8 @@ import time from unbiased.unbiasedObjects import * +from PIL import Image + #take in a url and delimiters, return twitter card def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): @@ -125,7 +127,7 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t return None -def buildOutput(newsSourceArr): +def buildOutput(newsSourceArr, webroot): #read in the template html file from jinja2 import Environment, PackageLoader, select_autoescape env = Environment( @@ -170,17 +172,25 @@ def buildOutput(newsSourceArr): print('\n\n@@@@\nNo H3 stories in '+newsSourceArr[x].name+'\n@@@@\n\n') # collect articles for each section + image_index = 0 + top_stories = [] for i in range(len(h1RandomSources)): source=newsSourceArr[h1RandomSources[i]] randomArticle=random.sample(range(len(source.h1Arr)), 1)[0] article=source.h1Arr[randomArticle] + img_name = pullImage(article.img, image_index, webroot, 350, 200) + image_index += 1 + article.img = img_name top_stories.append(article) middle_stories = [] for i in range(len(h2RandomPairs)): pair=h2RandomPairs[i] article=newsSourceArr[pair[0]].h2Arr[pair[1]] + img_name = pullImage(article.img, image_index, webroot, 150, 100) + image_index += 1 + article.img = img_name middle_stories.append(article) bottom_stories = [] @@ -189,7 +199,6 @@ def buildOutput(newsSourceArr): article=newsSourceArr[pair[0]].h3Arr[pair[1]] bottom_stories.append(article) - sourcesStr='' for i in range(len(newsSourceArr)-1): sourcesStr+=newsSourceArr[i].name+', ' @@ -274,3 +283,17 @@ def buildNewsSourceArr(sourceList, scratchDir): #return the original sourceList, #since everything should have been modified in place return sourceList + +def pullImage(url, index, webroot, width=350, height=200): + extension = url.split('.')[-1].split('?')[0] + img_name = 'img{}.{}'.format(index, extension) + out_file = os.path.join(webroot, img_name) + try: + subprocess.check_call(['wget', '-q', '-O', out_file, '--no-check-certificate', url]) + except Exception: + return '' + img = Image.open(out_file) + img.resize((width, height)) + jpg_name = 'img{}.jpg'.format(index) + img.save(os.path.join(webroot, jpg_name), 'JPEG') + return jpg_name -- cgit v1.2.3 From 7a8efb94dc2463a6d30afc77f10df78ebfa4c353 Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Wed, 19 Apr 2017 16:39:03 -0400 Subject: replace wget with requests library --- setup.py | 1 + unbiased/html_template/unbiased.css | 8 +-- unbiased/main.py | 2 +- unbiased/parser.py | 29 ++++++----- unbiased/unbiasedFunctions.py | 98 +++++++++++-------------------------- 5 files changed, 49 insertions(+), 89 deletions(-) (limited to 'setup.py') diff --git a/setup.py b/setup.py index 2755304..57c27c0 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ setup( install_requires=[ 'jinja2', 'Pillow', + 'requests', ], entry_points={ 'console_scripts': [ diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css index 6817cc9..1424ee9 100755 --- a/unbiased/html_template/unbiased.css +++ b/unbiased/html_template/unbiased.css @@ -108,8 +108,8 @@ a:hover{ width:350px; height:200px; overflow:hidden; - background-size: auto 234px;/*cover;*/ - background-position: top center;/*center center;*/ + background-size: 100%; + background-position: center center; margin:0 auto; } @@ -169,8 +169,8 @@ a:hover{ width:150px; height:100px; overflow:hidden; - background-size: auto 117px;/*cover;*/ - background-position: top center;/*center center;*/ + background-size: 100%; + background-position: center center; float:left; max-width:35%; } diff --git a/unbiased/main.py b/unbiased/main.py index c8a113e..c760788 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -76,7 +76,7 @@ def run(webroot, scratch): logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex)) #scrape all urls and build data structure - newsSourceArr=buildNewsSourceArr(sourceList, scratch) + newsSourceArr = sourceList #build the output file HTML outputHTML=buildOutput(newsSourceArr, webroot, scratch) diff --git a/unbiased/parser.py b/unbiased/parser.py index 2bba27d..0a8398c 100755 --- a/unbiased/parser.py +++ b/unbiased/parser.py @@ -4,6 +4,9 @@ import logging import os import re import subprocess +import urllib.parse + +import requests from unbiased.unbiasedObjects import * from unbiased.unbiasedFunctions import buildArticle @@ -16,21 +19,11 @@ Takes in a URL, downloads the file to a temp file, reads the file into a string, and returns that string ''' def urlToContent(url, scratchDir, sourceEncoding='utf8'): - temp_file = os.path.join(scratchDir, 'temp1.html') - - #download file - #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) - subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url]) - - #read file - if sourceEncoding=='utf8': - f=open(temp_file, 'r', encoding="utf8") + res = requests.get(url) + if res.status_code == 200: + return res.text else: - f=open(temp_file, 'r', encoding="latin-1") - content=f.read() - f.close() - - return content + raise Exception("Failed to download {}".format(url)) ''' @@ -39,6 +32,13 @@ calls the file scraper and appends the new Article object. Returns a newsSource2 object ''' def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): + + url_parts = urllib.parse.urlparse(url) + scheme = url_parts.scheme + h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs] + h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs] + h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs] + h1Arr=[] a=buildArticle(h1URLs[0], name, scratchDir) if a==None: @@ -54,7 +54,6 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir): else: logger.debug('H2 Nonetype in '+name) - h3Arr=[] for x in h3URLs: a=buildArticle(x, name, scratchDir) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py index 415a3cc..0181beb 100644 --- a/unbiased/unbiasedFunctions.py +++ b/unbiased/unbiasedFunctions.py @@ -1,4 +1,5 @@ import html +import io import logging import os import pkgutil @@ -6,10 +7,12 @@ import random import re import subprocess import time - -from unbiased.unbiasedObjects import * +import urllib.parse from PIL import Image +import requests + +from unbiased.unbiasedObjects import * logger = logging.getLogger('unbiased') @@ -21,16 +24,25 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t logger.debug(sourceName) logger.debug(url) - temp_article = os.path.join(scratchDir, 'temp_article.html') + url_parts = urllib.parse.urlparse(url) + scheme = url_parts.scheme #download url - #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url) - subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url]) + try: + res = requests.get(url) + except Exception as ex: + logger.error("""ARTICLE DOWNLOADING ERROR + SOURCE:\t{} + URL:\t{}""".format(sourceName, url)) + return None - #read the file in - f=open(temp_article, 'r', encoding="utf8") - content=f.read() - f.close() + if res.status_code == 200: + content = res.text + else: + logger.error("""ARTICLE DOWNLOADING ERROR + SOURCE:\t{} + URL:\t{}""".format(sourceName, url)) + return None try: if sourceName=='The Guardian US': @@ -61,6 +73,8 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t #trim to just before it then lop it off img=img[:-1].strip() img=img[:-1] + # fix the scheme if it's missing + img = urllib.parse.urlparse(img, scheme=scheme).geturl() if debugging: logger.debug(img) @@ -232,68 +246,16 @@ def printOutputHTML(outputHTML, outDir): with open(os.path.join(outDir, filename), 'wb') as fp: fp.write(data) -def buildNewsSourceArr(sourceList, scratchDir): - - #build the data structure - i=0 - listLen=len(sourceList) - while i < listLen: - source=sourceList[i] - - if type(source) is NewsSource2: - i+=1 - continue - - url=source.url - - temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i)) - - #download file - #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url) - subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url]) - - #read file - f=open(temp_file, 'r', encoding="utf8") - content=f.read() - f.close() - - #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS - #os.remove(temp_file) - - #add stories etc to the NewsSource object - h1s, h2s, h3s=extractURLs(content, source) - - #build the Article objects and add to newsSource's appropriate list - if h1s!=None and h2s!=None: - for url in h1s: - article=buildArticle(url, source.name, scratchDir) - if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article) - for url in h2s: - article=buildArticle(url, source.name, scratchDir) - if article!=None: sourceList[i].h2Arr.append(article) - for url in h3s: - article=buildArticle(url, source.name, scratchDir) - if article!=None: sourceList[i].h3Arr.append(article) - i+=1 - else: - sourceList.remove(source) - listLen-=1 - - - #return the original sourceList, - #since everything should have been modified in place - return sourceList - def pullImage(url, index, webroot, scratch, target_width=350, target_height=200): extension = url.split('.')[-1].split('?')[0] img_name = 'img{}.{}'.format(index, extension) - tmp_file = os.path.join(scratch, img_name) - try: - subprocess.check_call(['wget', '-q', '-O', tmp_file, '--no-check-certificate', url]) - except Exception as ex: - logger.error('Failed to pull image: url={} ex={}'.format(url, ex)) + res = requests.get(url) + if res.status_code == 200: + content = res.content + else: + logger.error('Image not found: url={}'.format(url)) return '' - img = Image.open(tmp_file) + img = Image.open(io.BytesIO(content)) # crop to aspect ratio target_ar = target_width / target_height left, top, right, bottom = img.getbbox() @@ -315,6 +277,4 @@ def pullImage(url, index, webroot, scratch, target_width=350, target_height=200) jpg_name = 'img{}.jpg'.format(index) out_file = os.path.join(webroot, jpg_name) img.save(out_file, 'JPEG') - if tmp_file != out_file: - os.remove(tmp_file) return jpg_name -- cgit v1.2.3 From 1cbd15b3f35e162a21b2dc2ac784b9acf71b6c3d Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Fri, 21 Apr 2017 22:40:34 -0400 Subject: include favicons in the distribution --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'setup.py') diff --git a/setup.py b/setup.py index 57c27c0..2761041 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,14 @@ from setuptools import setup setup( name="unbiased", - version="0", + version="1", packages=['unbiased'], package_data={ 'unbiased': [ 'html_template/*.html', 'html_template/*.css', + 'html_template/*.ico', + 'html_template/*.png', ], }, install_requires=[ -- cgit v1.2.3