From 7a8efb94dc2463a6d30afc77f10df78ebfa4c353 Mon Sep 17 00:00:00 2001
From: Matt Singleton
Date: Wed, 19 Apr 2017 16:39:03 -0400
Subject: replace wget with requests library

---
 setup.py                            |  1 +
 unbiased/html_template/unbiased.css |  8 +--
 unbiased/main.py                    |  2 +-
 unbiased/parser.py                  | 29 ++++++-----
 unbiased/unbiasedFunctions.py       | 98 +++++++++++--------------------------
 5 files changed, 49 insertions(+), 89 deletions(-)

diff --git a/setup.py b/setup.py
index 2755304..57c27c0 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@ setup(
     install_requires=[
         'jinja2',
         'Pillow',
+        'requests',
     ],
     entry_points={
         'console_scripts': [
diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css
index 6817cc9..1424ee9 100755
--- a/unbiased/html_template/unbiased.css
+++ b/unbiased/html_template/unbiased.css
@@ -108,8 +108,8 @@ a:hover{
     width:350px;
     height:200px;
     overflow:hidden;
-    background-size: auto 234px;/*cover;*/
-    background-position: top center;/*center center;*/
+    background-size: 100%;
+    background-position: center center;
     margin:0 auto;
 }
 
@@ -169,8 +169,8 @@ a:hover{
     width:150px;
     height:100px;
     overflow:hidden;
-    background-size: auto 117px;/*cover;*/
-    background-position: top center;/*center center;*/
+    background-size: 100%;
+    background-position: center center;
     float:left;
     max-width:35%;
 }
diff --git a/unbiased/main.py b/unbiased/main.py
index c8a113e..c760788 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -76,7 +76,7 @@ def run(webroot, scratch):
             logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
 
     #scrape all urls and build data structure
-    newsSourceArr=buildNewsSourceArr(sourceList, scratch)
+    newsSourceArr = sourceList
 
     #build the output file HTML
     outputHTML=buildOutput(newsSourceArr, webroot, scratch)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 2bba27d..0a8398c 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -4,6 +4,9 @@ import logging
 import os
 import re
 import subprocess
+import urllib.parse
+
+import requests
 
 from unbiased.unbiasedObjects import *
 from unbiased.unbiasedFunctions import buildArticle
@@ -16,21 +19,11 @@ Takes in a URL, downloads the file to a temp file,
 reads the file into a string, and returns that string
 '''
 def urlToContent(url, scratchDir, sourceEncoding='utf8'):
-    temp_file = os.path.join(scratchDir, 'temp1.html')
-
-    #download file
-    #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
-    subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
-
-    #read file
-    if sourceEncoding=='utf8':
-        f=open(temp_file, 'r', encoding="utf8")
+    res = requests.get(url)
+    if res.status_code == 200:
+        return res.text
     else:
-        f=open(temp_file, 'r', encoding="latin-1")
-    content=f.read()
-    f.close()
-
-    return content
+        raise Exception("Failed to download {}".format(url))
 
 
 '''
@@ -39,6 +32,13 @@ calls the file scraper and appends the new Article object.
 Returns a newsSource2 object
 '''
 def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
+
+    url_parts = urllib.parse.urlparse(url)
+    scheme = url_parts.scheme
+    h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs]
+    h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs]
+    h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
+
     h1Arr=[]
     a=buildArticle(h1URLs[0], name, scratchDir)
     if a==None:
@@ -54,7 +54,6 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
     else:
         logger.debug('H2 Nonetype in '+name)
 
-
     h3Arr=[]
     for x in h3URLs:
        a=buildArticle(x, name, scratchDir)
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 415a3cc..0181beb 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -1,4 +1,5 @@
 import html
+import io
 import logging
 import os
 import pkgutil
@@ -6,10 +7,12 @@ import random
 import re
 import subprocess
 import time
-
-from unbiased.unbiasedObjects import *
+import urllib.parse
 
 from PIL import Image
+import requests
+
+from unbiased.unbiasedObjects import *
 
 logger = logging.getLogger('unbiased')
 
@@ -21,16 +24,25 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
         logger.debug(sourceName)
         logger.debug(url)
 
-    temp_article = os.path.join(scratchDir, 'temp_article.html')
+    url_parts = urllib.parse.urlparse(url)
+    scheme = url_parts.scheme
 
     #download url
-    #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
-    subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url])
+    try:
+        res = requests.get(url)
+    except Exception as ex:
+        logger.error("""ARTICLE DOWNLOADING ERROR
+        SOURCE:\t{}
+        URL:\t{}""".format(sourceName, url))
+        return None
 
-    #read the file in
-    f=open(temp_article, 'r', encoding="utf8")
-    content=f.read()
-    f.close()
+    if res.status_code == 200:
+        content = res.text
+    else:
+        logger.error("""ARTICLE DOWNLOADING ERROR
+        SOURCE:\t{}
+        URL:\t{}""".format(sourceName, url))
+        return None
 
     try:
         if sourceName=='The Guardian US':
@@ -61,6 +73,8 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
             #trim to just before it then lop it off
             img=img[:-1].strip()
             img=img[:-1]
+            # fix the scheme if it's missing
+            img = urllib.parse.urlparse(img, scheme=scheme).geturl()
 
             if debugging:
                 logger.debug(img)
@@ -232,68 +246,16 @@ def printOutputHTML(outputHTML, outDir):
         with open(os.path.join(outDir, filename), 'wb') as fp:
             fp.write(data)
 
-def buildNewsSourceArr(sourceList, scratchDir):
-
-    #build the data structure
-    i=0
-    listLen=len(sourceList)
-    while i < listLen:
-        source=sourceList[i]
-
-        if type(source) is NewsSource2:
-            i+=1
-            continue
-
-        url=source.url
-
-        temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i))
-
-        #download file
-        #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
-        subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
-
-        #read file
-        f=open(temp_file, 'r', encoding="utf8")
-        content=f.read()
-        f.close()
-
-        #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
-        #os.remove(temp_file)
-
-        #add stories etc to the NewsSource object
-        h1s, h2s, h3s=extractURLs(content, source)
-
-        #build the Article objects and add to newsSource's appropriate list
-        if h1s!=None and h2s!=None:
-            for url in h1s:
-                article=buildArticle(url, source.name, scratchDir)
-                if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
-            for url in h2s:
-                article=buildArticle(url, source.name, scratchDir)
-                if article!=None: sourceList[i].h2Arr.append(article)
-            for url in h3s:
-                article=buildArticle(url, source.name, scratchDir)
-                if article!=None: sourceList[i].h3Arr.append(article)
-            i+=1
-        else:
-            sourceList.remove(source)
-            listLen-=1
-
-
-    #return the original sourceList,
-    #since everything should have been modified in place
-    return sourceList
-
 def pullImage(url, index, webroot, scratch, target_width=350, target_height=200):
     extension = url.split('.')[-1].split('?')[0]
     img_name = 'img{}.{}'.format(index, extension)
-    tmp_file = os.path.join(scratch, img_name)
-    try:
-        subprocess.check_call(['wget', '-q', '-O', tmp_file, '--no-check-certificate', url])
-    except Exception as ex:
-        logger.error('Failed to pull image: url={} ex={}'.format(url, ex))
+    res = requests.get(url)
+    if res.status_code == 200:
+        content = res.content
+    else:
+        logger.error('Image not found: url={}'.format(url))
        return ''
-    img = Image.open(tmp_file)
+    img = Image.open(io.BytesIO(content))
     # crop to aspect ratio
     target_ar = target_width / target_height
     left, top, right, bottom = img.getbbox()
@@ -315,6 +277,4 @@ def pullImage(url, index, webroot, scratch, target_width=350, target_height=200)
     jpg_name = 'img{}.jpg'.format(index)
     out_file = os.path.join(webroot, jpg_name)
     img.save(out_file, 'JPEG')
-    if tmp_file != out_file:
-        os.remove(tmp_file)
     return jpg_name
--
cgit v1.2.3
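
Note, not part of the patch above: the change swaps every wget subprocess call for an in-process requests.get() followed by a status-code check, and pullImage now decodes the downloaded bytes straight from memory via io.BytesIO instead of writing a scratch file. A minimal sketch of that pattern, assuming requests and Pillow are installed; the helper names fetch_text and fetch_image are illustrative only and do not appear in the patch:

    import io

    import requests
    from PIL import Image

    def fetch_text(url):
        # In-process download replacing `wget -q -O <tmpfile> <url>`.
        res = requests.get(url)
        if res.status_code == 200:
            return res.text
        raise Exception("Failed to download {}".format(url))

    def fetch_image(url):
        # Binary payloads come from res.content; no temp file is needed on disk.
        res = requests.get(url)
        if res.status_code != 200:
            return None
        return Image.open(io.BytesIO(res.content))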