From 4b5f6067f05b5e35555f8e55219808470f9d664f Mon Sep 17 00:00:00 2001 From: Matt Singleton Date: Sat, 14 Oct 2017 19:45:13 -0400 Subject: code cleanup --- unbiased/main.py | 2 +- unbiased/parser.py | 986 ------------------------------------------ unbiased/spotCheck.py | 41 -- unbiased/unbiasedFunctions.py | 241 ----------- unbiased/unbiasedObjects.py | 97 ----- unbiased/util.py | 113 +++++ 6 files changed, 114 insertions(+), 1366 deletions(-) delete mode 100755 unbiased/parser.py delete mode 100755 unbiased/spotCheck.py delete mode 100644 unbiased/unbiasedFunctions.py delete mode 100644 unbiased/unbiasedObjects.py create mode 100644 unbiased/util.py diff --git a/unbiased/main.py b/unbiased/main.py index 89071b1..19fd05b 100755 --- a/unbiased/main.py +++ b/unbiased/main.py @@ -6,7 +6,7 @@ import logging import logging.config import time -from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, write_files, write_static_files +from unbiased.util import pickStories, pullImage, buildOutput, write_files, write_static_files from unbiased.sources import get_sources logger = logging.getLogger('unbiased') diff --git a/unbiased/parser.py b/unbiased/parser.py deleted file mode 100755 index 399e0f2..0000000 --- a/unbiased/parser.py +++ /dev/null @@ -1,986 +0,0 @@ -#!/usr/bin/env python3 - -import logging -import os -import re -import urllib.parse - -from bs4 import BeautifulSoup -import requests - -from unbiased.unbiasedObjects import * -from unbiased.unbiasedFunctions import buildArticle - -logger = logging.getLogger('unbiased') - - -''' -Takes in a URL, downloads the file to a temp file, -reads the file into a string, and returns that string -''' -def urlToContent(url, sourceEncoding='utf8'): - res = requests.get(url) - if res.status_code == 200: - return res.text - else: - raise Exception("Failed to download {}".format(url)) - - -''' -Creates a new newsSource2 object. For each URL in h1-h3URLs, -calls the file scraper and appends the new Article object. -Returns a newsSource2 object -''' -def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs): - - url_parts = urllib.parse.urlparse(url) - scheme = url_parts.scheme - h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs] - h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs] - h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs] - - h1Arr=[] - a=buildArticle(h1URLs[0], name) - if a==None: - logger.debug('H1 Nonetype in '+name) - else: - h1Arr.append(a) - - h2Arr=[] - for x in h2URLs: - a=buildArticle(x, name) - if a!=None: - h2Arr.append(a) - else: - logger.debug('H2 Nonetype in '+name) - - h3Arr=[] - for x in h3URLs: - a=buildArticle(x, name) - if a!=None: - h3Arr.append(a) - else: - logger.debug('H3 Nonetype in '+name) - - #BUILD THE NEWS SOURCE - newsSource=NewsSource2(name, url, h1Arr, h2Arr, h3Arr) - - return newsSource - - -''' -Some sites will replicate URLs across the page. This function removes them. -Check hierarchically: if h3 exists in h1s or h2s, remove from h3s; -if h2 exists in h1s, remove from h2s - -also check partial URLs (e.g. 
nytimes.com/story.html is the same as -nytimes.com/story.html?var=x -''' -def removeDuplicates(h1s, h2s, h3s): - #Assume h1s is one element, and keep it - - #remove h2 duplicates - removeArr=[] - for i in range(len(h2s)): - #check internally - for j in range(len(h2s)): - if i==j: - continue - else: - if h2s[i] in h2s[j]: - removeArr.append(h2s[j]) - #check against h1s - for k in range(len(h1s)): - if (h2s[i] in h1s[k]) or (h1s[k] in h2s[i]): - removeArr.append(h2s[i]) - for x in removeArr: - h2s.remove(x) - - #remove h3 duplicates - removeArr=[] - for i in range(len(h3s)): - #check internally - for j in range(len(h3s)): - if i==j: - continue - else: - if h3s[i] in h3s[j]: - removeArr.append(h3s[j]) - #check against h1s and h2s - h1and2=h1s+h2s - for k in range(len(h1and2)): - if (h3s[i] in h1and2[k]) or (h1and2[k] in h3s[i]): - removeArr.append(h3s[i]) - for x in removeArr: - if x in h3s: - h3s.remove(x) - - - return h1s, h2s, h3s - - - -def removalNotification(source, title, reason, value): - logger.debug("""Story removed - SOURCE:\t{} - TITLE:\t{}) - REASON:\t{} - VALUE:\t{}""".format(source, title, reason, value)) - - -def removeBadStoriesHelper(source, element, badStringList, article_tiers): - if badStringList is None: - return - for tier, articles in enumerate(article_tiers): - for idx, article in enumerate(articles): - if article is None: - logger.debug("None type found in removeBadStoriesHelper for {}".format(source.name)) - break - for item in badStringList: - if item in getattr(article, element): - article_tiers[tier].remove(article) - # if it's in the h1 slot, bump up the - # first h2 into the h1 slot - if tier == 0 and len(article_tiers[1]) > 0: - article_tiers[0].append(article_tiers[1][0]) - article_tiers[1].remove(article_tiers[1][0]) - removalNotification(source.name, article.title, element, item) - - -def removeBadStories(source, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr=None): - - arr=[source.h1Arr, source.h2Arr, source.h3Arr] - - removeBadStoriesHelper(source, "title", badTitleArr, arr) - removeBadStoriesHelper(source, "description", badDescArr, arr) - removeBadStoriesHelper(source, "author", badAuthorArr, arr) - removeBadStoriesHelper(source, "img", badImgArr, arr) - removeBadStoriesHelper(source, "url", badURLArr, arr) - - return source - - - - -def buildTheHill(): - url='http://thehill.com' - name='The Hill' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('
', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
', 1)[1] - h3=h3.split('', 1)[0] - while '
')[2:] - for x in h2: - if '

', 1)[1] - h3=h3.split('
', 1)[0]#'', 1)[0] - while '
  • ' in h2: - h2=h2.split('
  • ', 1)[1] - h2=h2.split('', 1)[1] - h2=h2.split('
    ', 1)[1] - h3=h3.split('Watch/Listen', 1)[0] - while '
    ', 1)[1] - h1=h1.split('href="', 1)[1] - h1=h1.split('"', 1)[0] - h1s=[h1] - - #GET SECONDARY HEADLINES - h2=content - h2s=[] - h2=h2.split('
    ', 1)[1] - h2=h2.split('
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split('href="', 1)[1] - x=h2.split('"', 1)[0] - if h1 not in x: - h2s.append(x) - - #GET TERTIARY HEADLINES - h3=content - h3s=[] - h3=h3.split('Today\'s Standard', 1)[1] - h3=h3.split('
    ' in h3: - h3=h3.split('
    ', 1)[1] - h3=h3.split('href="', 1)[1] - x=h3.split('"', 1)[0] - if h1 not in x: - h3s.append(x) - - #Need to add URL prefix to all URLs - for i in range(len(h1s)): - h1s[i]=url+h1s[i] - for i in range(len(h2s)): - h2s[i]=url+h2s[i] - for i in range(len(h3s)): - h3s[i]=url+h3s[i] - - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - wkl=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=None - ## if flagged again, remove Micah Mattix - badDescArr=['Matt Labash'] - badAuthorArr=['MATT LABASH', 'TWS PODCAST', 'ERIC FELTEN', 'Steven J. Lenzner', 'MARK HEMINGWAY'] - badImgArr=['http://www.weeklystandard.com/s3/tws15/images/twitter/tws-twitter_1024x512.png'] - wkl=removeBadStories(wkl, badTitleArr, badDescArr, badAuthorArr, badImgArr) - - return wkl - - - - -def buildNPR(): - url='http://www.npr.org/sections/news/' - name='NPR' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - h1=content - h1=h1.split('', 1)[1] - h1=h1.split('', 1)[1] - h2=h2.split('', 1)[0] - while '
    ' in h2: - h2=h2.split('
    ', 1)[1] - h2=h2.split(' a') - h2s = [x['href'] for x in h2s] - h2s = ['http:' + x if x.startswith('//') else x for x in h2s] - - #GET TERTIARY HEADLINES - h3s = [] - for ul in soup.find('section', id='latest').find_all('ul', recursive=False): - for li in ul.find_all('li', recursive=False): - h3s.append(li.find('a')['href']) - h3s = ['http:' + x if x.startswith('//') else x for x in h3s] - - h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s) - fox=buildNewsSource2(name, url, h1s, h2s, h3s) - - #REMOVE BAD STORIES - badTitleArr=['O'Reilly', 'Fox News', 'Brett Baier', 'Tucker'] - badDescArr=['Sean Hannity'] - badAuthorArr=['Bill O\'Reilly', 'Sean Hannity', 'Howard Kurtz'] - badImgArr=['http://www.foxnews.com/content/dam/fox-news/logo/og-fn-foxnews.jpg'] - badURLArr=['http://www.foxnews.com/opinion', 'videos.foxnews.com'] - fox=removeBadStories(fox, badTitleArr, badDescArr, badAuthorArr, badImgArr, badURLArr) - - return fox - - - -def buildNYT(): - url='http://www.nytimes.com' - name='New York Times' - - #DOWNLOAD HOMEPAGE CONTENT - content=urlToContent(url) - - #get main headline - #this will likely need if/else logic - h1=content - - if 'story theme-summary banner' in h1: - #This is with a large headline over a and b columns - h1=h1.split('story theme-summary banner', 1)[1] - h1=h1.split('', 1)[1] - h1=h1.split('
    ', 1)[1] - h2=h2.split('', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h2.split('
    ', 1) - h2=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '', 1)[1] - h3=h3.split('', 1)[0] - #remove "collection" sets - while '
    ' in h3: - arr=h3.split('
    ', 1) - h3=arr[0]+arr[1].split('', 1)[1] - #Grab the remaining URLs - while '' in h3: - h3=h3.split('', 1)[1] - h3=h3.split('', 1)[0] - elif '/video/the-daily-360' in h3: - h3=h3.split('/video/the-daily-360')[-1] - h3=h3.split('More News', 1)[0] - #remove "collection" sets - while '
    ' in h2: - arr=h3.split('
    ', 1) - h3=arr[0]+arr[1].split('', 1)[1] - - #Grab the remaining URLs - while ' - -
    - -
    -

    Top News

    - -
    - - -''' diff --git a/unbiased/spotCheck.py b/unbiased/spotCheck.py deleted file mode 100755 index 7ce50d3..0000000 --- a/unbiased/spotCheck.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python3 - -import sys - -from unbiased.parser import * -from unbiased.unbiasedObjects import * - -def spotCheck(src): - - fns = {'hil' : buildTheHill, - 'cbs' : buildCBS, - 'npr' : buildNPR, - 'fox' : buildFoxNews, - 'gdn' : buildGuardian, - 'blz' : buildBlaze, - 'bbc' : buildBBC, - 'nbc' : buildNBC, - 'wat' : buildWashTimes, - 'csm' : buildCSM, - 'abc' : buildABC} - - data=fns[src]() - - print('H1s:\n--------------') - for h in data.h1Arr: - print(h.title) - - print('\n\nH2s:\n--------------') - for h in data.h2Arr: - print(h.title) - - print('\n\nH3s:\n--------------') - for h in data.h3Arr: - print(h.title) - - print('\n\n') - - - -if __name__=='__main__': - spotCheck(sys.argv[1]) diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py deleted file mode 100644 index b07245c..0000000 --- a/unbiased/unbiasedFunctions.py +++ /dev/null @@ -1,241 +0,0 @@ -import html -import io -import logging -import os -import pkgutil -import random -import re -import shutil -import time -import urllib.parse - -from PIL import Image -import requests - -logger = logging.getLogger('unbiased') - -#take in a url and delimiters, return twitter card -def buildArticle(url, sourceName, encoding=None):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd): - - debugging=False - if debugging: - logger.debug(sourceName) - logger.debug(url) - - url_parts = urllib.parse.urlparse(url) - scheme = url_parts.scheme - - #download url - try: - res = requests.get(url) - except Exception as ex: - logger.debug("""ARTICLE DOWNLOADING ERROR - SOURCE:\t{} - URL:\t{}""".format(sourceName, url)) - return None - - if res.status_code == 200: - content = res.text - else: - logger.debug("""ARTICLE DOWNLOADING ERROR - SOURCE:\t{} - URL:\t{}""".format(sourceName, url)) - return None - - try: - if sourceName=='The Guardian US': - #The Guardian puts an identifying banner on their og:images - #grab the main image from the page instead - - #scenario 1: regular image - if '')[0] - elif sourceName=='ABC News': - img='https://c1.staticflickr.com/7/6042/6276688407_12900948a2_b.jpgX' - if img[-1]=='/': - #because the quote separator could be ' or ", - #trim to just before it then lop it off - img=img[:-1].strip() - img=img[:-1] - # fix the scheme if it's missing - img = urllib.parse.urlparse(img, scheme=scheme).geturl() - - if debugging: - logger.debug(img) - - title=content.split('og:title" content=')[1][1:].split('>')[0] - if title[-1]=='/': - title=title[:-1].strip() - title=title[:-1] - - if debugging: - logger.debug(title) - - - author='' - if sourceName=='The Blaze': - if 'class="article-author">' in content: - author=content.split('class="article-author">')[1].split('<')[0] - elif 'class="article-author" href="' in content: - author=content.split('class="article-author" href="')[1] - author=author.split('>')[1].split('<')[0].strip() - else: - authorTags=['article:author', 'dc.creator', 'property="author'] - for tag in authorTags: - if tag in content: - author=content.split(tag+'" content=')[1][1:].split('>')[0] - author=author[:-1] - #trim an extra quotation mark for The Hill - if sourceName=='The Hill': - author=author.split('"', 1)[0] - break - - if debugging: - logger.debug(author) - - - if 'og:description' in content: - description=content.split('og:description" content=')[1][1:].split('>')[0] - if description[-1]=='/': - 
description=description[:-1].strip() - description=description[:-1] - else: - if sourceName=='The Hill': - description=content.split('div class="field-items"')[-1] - description=re.sub('<[^<]+?>', '', description) - description=description[1:200] - else: - logger.debug("SHOULDN'T GET HERE") - - #strip out self-references - description=description.replace(sourceName+"'s", '***') - description=description.replace(sourceName+"'", '***') - description=description.replace(sourceName, '***') - - if debugging: - logger.debug(description) - - - a=Article(html.unescape(title), url, img, html.unescape(description), sourceName, html.unescape(author)) - return a - - except Exception: - logger.debug("""ARTICLE PARSING ERROR - SOURCE:\t{} - URL:\t{}""".format(sourceName, url)) - return None - - -def pick_randoms(story_lists, length, per_source): - """ - Return a randomly chosen list of 'length' stories, picking at - most 'per_source' stories from each source. - """ - # TODO: weighting is incorrect if a source has fewer than 'per_source' articles - urandom = random.SystemRandom() - candidates = [] - for stories in story_lists: - indexes = list(range(len(stories))) - urandom.shuffle(indexes) - random_indexes = indexes[:per_source] - candidates.extend([stories[x] for x in random_indexes]) - indexes = list(range(len(candidates))) - urandom.shuffle(indexes) - random_indexes = indexes[:length] - return tuple(candidates[x] for x in random_indexes) - - -def pickStories(newsSourceArr): - h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1) - h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2) - h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2) - return h1s, h2s, h3s - -def buildOutput(top_stories, middle_stories, bottom_stories): - #read in the template html file - from jinja2 import Environment, PackageLoader, select_autoescape - env = Environment( - loader=PackageLoader('unbiased', 'html_template'), - autoescape=select_autoescape(['html', 'xml']) - ) - template = env.get_template('unbiased.jinja.html') - - timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime()) - utime = int(time.time()) - - sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories])) - - html = template.render( - timestamp = timestamp, - utime = utime, - top_stories = top_stories, - middle_stories = middle_stories, - bottom_stories = bottom_stories, - sources = sourcesStr, - ) - - #return updated text - return html - -def write_files(files_to_write, outDir): - for name, bytesio in files_to_write.items(): - with open(os.path.join(outDir, name), 'wb') as fp: - shutil.copyfileobj(bytesio, fp) - -def write_static_files(outDir): - # copy over static package files - for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']: - data = pkgutil.get_data('unbiased', os.path.join('html_template', filename)) - with open(os.path.join(outDir, filename), 'wb') as fp: - fp.write(data) - -def pullImage(url, index, webroot, target_width=350, target_height=200): - extension = url.split('.')[-1].split('?')[0] - img_name = 'img{}.{}'.format(index, extension) - res = requests.get(url) - if res.status_code == 200: - content = res.content - else: - logger.debug('Image not found: url={}'.format(url)) - return '' - img = Image.open(io.BytesIO(content)) - # crop to aspect ratio - target_ar = target_width / target_height - left, top, right, bottom = img.getbbox() - height = bottom - top - width = right - left - ar = width / height - if 
target_ar > ar: - new_height = (target_height / target_width) * width - bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2)) - img = img.crop(bbox) - elif target_ar < ar: - new_width = (target_width / target_height) * height - bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom) - img = img.crop(bbox) - # resize if larger - if target_width * 2 < width or target_height * 2 < height: - img = img.resize((target_width*2, target_height*2), Image.LANCZOS) - # TODO: fill with a neutral color instead of just discarding alpha channel - img = img.convert('RGB') - # TODO: create retina images - jpg_name = 'img{}.jpg'.format(index) - jpg_file = io.BytesIO() - out_file = os.path.join(webroot, jpg_name) - img.save(jpg_file, 'JPEG') - jpg_file.seek(0) - return jpg_name, jpg_file diff --git a/unbiased/unbiasedObjects.py b/unbiased/unbiasedObjects.py deleted file mode 100644 index 9a8a78a..0000000 --- a/unbiased/unbiasedObjects.py +++ /dev/null @@ -1,97 +0,0 @@ -import logging - -logger = logging.getLogger('unbiased') - -class Article(): - title='' - url='' - img='' - description='' - source='' - author='' - - def __init__(self, title, url, img, description, source, author): - self.title=title - self.url=url - self.img=img - self.description=description - self.source=source - self.author=author - - def __str__(self): - return '-----------\ntitle: {}\nauthor: {}\nsource: {}\ndescription: {}\nurl: {}\nimg: {}\n-----------'.format(self.title, self.author, self.source, self.description, self.url, self.img) - - def __repr__(self): - return '{}({}, {}, {})'.format(self.source.replace(' ', ''), self.title, self.author, self.url) - - -class NewsSource2(): - name='' - url='' - h1Arr=[] - h2Arr=[] - h3Arr=[] - def __init__(self, name, url, h1Arr, h2Arr, h3Arr): - self.name=name - self.url=url - self.h1Arr=h1Arr - self.h2Arr=h2Arr - self.h3Arr=h3Arr - - - -class NewsSource(): - name='' - url='' - #multiple start values to step through file. 
end value default to '"'
-    h1SectionDividerStart=None
-    h1SectionDividerEnd=None
-    h1DelStart=[]
-    h1DelEnd='"'
-    h2SectionDividerStart=None
-    h2SectionDividerEnd=None
-    h2DelStart=[]
-    h2DelEnd='"'
-    h3SectionDividerStart=None
-    h3SectionDividerEnd=None
-    h3DelStart=[]
-    h3DelEnd='"'
-    #arrays of Article object types
-    h1Arr=None
-    h2Arr=None
-    h3Arr=None
-    #url to attach to stub links
-    stubURL=''
-
-    def __init__(self, name, url,
-                 h1DelStart, h2DelStart, h3DelStart,
-                 h1SectionDividerStart=None, h1SectionDividerEnd=None,
-                 h2SectionDividerStart=None, h2SectionDividerEnd=None,
-                 h3SectionDividerStart=None, h3SectionDividerEnd=None,
-                 stubURL=None):
-        self.name=name
-        self.url=url
-        self.h1DelStart=h1DelStart
-        self.h2DelStart=h2DelStart
-        self.h3DelStart=h3DelStart
-        self.h1SectionDividerStart=h1SectionDividerStart
-        self.h2SectionDividerStart=h2SectionDividerStart
-        self.h3SectionDividerStart=h3SectionDividerStart
-        self.h1SectionDividerEnd=h1SectionDividerEnd
-        self.h2SectionDividerEnd=h2SectionDividerEnd
-        self.h3SectionDividerEnd=h3SectionDividerEnd
-        self.h1Arr=[]
-        self.h2Arr=[]
-        self.h3Arr=[]
-        self.stubURL=stubURL
-
-    def addArticle(self, article, level):
-        if level==1:
-            self.h1Arr.append(article)
-        elif level==2:
-            self.h2Arr.append(article)
-        elif level==3:
-            self.h3Arr.append(article)
-        else:
-            logger.debug("Invalid level in NewsSource.addArtlce: " + level)
-
diff --git a/unbiased/util.py b/unbiased/util.py
new file mode 100644
index 0000000..12003b1
--- /dev/null
+++ b/unbiased/util.py
@@ -0,0 +1,113 @@
+import io
+import logging
+import os
+import pkgutil
+import random
+import shutil
+import time
+
+from PIL import Image
+import requests
+
+logger = logging.getLogger('unbiased')
+
+
+def pick_randoms(story_lists, length, per_source):
+    """
+    Return a randomly chosen list of 'length' stories, picking at
+    most 'per_source' stories from each source.
+    """
+    # TODO: weighting is incorrect if a source has fewer than 'per_source' articles
+    urandom = random.SystemRandom()
+    candidates = []
+    for stories in story_lists:
+        indexes = list(range(len(stories)))
+        urandom.shuffle(indexes)
+        random_indexes = indexes[:per_source]
+        candidates.extend([stories[x] for x in random_indexes])
+    indexes = list(range(len(candidates)))
+    urandom.shuffle(indexes)
+    random_indexes = indexes[:length]
+    return tuple(candidates[x] for x in random_indexes)
+
+
+def pickStories(newsSourceArr):
+    h1s = pick_randoms([x.h1s for x in newsSourceArr], 4, 1)
+    h2s = pick_randoms([x.h2s for x in newsSourceArr], 6, 2)
+    h3s = pick_randoms([x.h3s for x in newsSourceArr], 12, 2)
+    return h1s, h2s, h3s
+
+
+def buildOutput(top_stories, middle_stories, bottom_stories):
+    # read in the template html file
+    from jinja2 import Environment, PackageLoader, select_autoescape
+    env = Environment(
+        loader=PackageLoader('unbiased', 'html_template'),
+        autoescape=select_autoescape(['html', 'xml'])
+    )
+    template = env.get_template('unbiased.jinja.html')
+
+    timestamp = time.strftime("%a, %b %-d, %-I:%M%P %Z", time.localtime())
+    utime = int(time.time())
+
+    sourcesStr = ', '.join(set([x.source for x in top_stories] + [x.source for x in middle_stories] + [x.source for x in bottom_stories]))
+
+    html = template.render(
+        timestamp=timestamp,
+        utime=utime,
+        top_stories=top_stories,
+        middle_stories=middle_stories,
+        bottom_stories=bottom_stories,
+        sources=sourcesStr,
+    )
+
+    return html
+
+
+def write_files(files_to_write, outDir):
+    for name, bytesio in files_to_write.items():
+        with open(os.path.join(outDir, name), 'wb') as fp:
+            shutil.copyfileobj(bytesio, fp)
+
+
+def write_static_files(outDir):
+    # copy over static package files
+    for filename in ['unbiased.css', 'favicon.ico', 'favicon.png', 'apple-touch-icon.png']:
+        data = pkgutil.get_data('unbiased', os.path.join('html_template', filename))
+        with open(os.path.join(outDir, filename), 'wb') as fp:
+            fp.write(data)
+
+
+def pullImage(url, index, webroot, target_width=350, target_height=200):
+    res = requests.get(url)
+    if res.status_code == 200:
+        content = res.content
+    else:
+        logger.debug('Image not found: url={}'.format(url))
+        return ''
+    img = Image.open(io.BytesIO(content))
+    # crop to aspect ratio
+    target_ar = target_width / target_height
+    left, top, right, bottom = img.getbbox()
+    height = bottom - top
+    width = right - left
+    ar = width / height
+    if target_ar > ar:
+        new_height = (target_height / target_width) * width
+        bbox = (left, top + ((height - new_height) / 2), right, bottom - ((height - new_height) / 2))
+        img = img.crop(bbox)
+    elif target_ar < ar:
+        new_width = (target_width / target_height) * height
+        bbox = (left + ((width - new_width) / 2), top, right - ((width - new_width) / 2), bottom)
+        img = img.crop(bbox)
+    # resize if larger
+    if target_width * 2 < width or target_height * 2 < height:
+        img = img.resize((target_width * 2, target_height * 2), Image.LANCZOS)
+    # TODO: fill with a neutral color instead of just discarding alpha channel
+    img = img.convert('RGB')
+    # TODO: create retina images
+    jpg_name = 'img{}.jpg'.format(index)
+    jpg_file = io.BytesIO()
+    img.save(jpg_file, 'JPEG')
+    jpg_file.seek(0)
+    return jpg_name, jpg_file
--
cgit v1.2.3
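
For reference, a minimal sketch of how the relocated helpers in unbiased/util.py fit together. This is an illustration only, not the driver code in unbiased/main.py: the build_site function, its sources and webroot arguments, the index.html output name, and the assumption that each story object is mutable and carries an img URL are all hypothetical.

import io

from unbiased.util import pickStories, pullImage, buildOutput, write_files, write_static_files


def build_site(sources, webroot):
    # 'sources' is assumed to be a list of already-scraped source objects
    # exposing h1s/h2s/h3s story lists, which is what pickStories expects.
    top, middle, bottom = pickStories(sources)

    files_to_write = {}
    # pullImage returns (jpg_name, BytesIO) on success, or '' on failure.
    for i, story in enumerate(top + middle + bottom):
        pulled = pullImage(story.img, i, webroot)
        if pulled:
            name, data = pulled
            story.img = name  # point the story at the local copy (assumes mutable story objects)
            files_to_write[name] = data

    html = buildOutput(top, middle, bottom)
    files_to_write['index.html'] = io.BytesIO(html.encode('utf8'))  # output filename is an assumption

    write_files(files_to_write, webroot)
    write_static_files(webroot)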