-rw-r--r--  setup.py                             |  1
-rwxr-xr-x  unbiased/html_template/unbiased.css  |  8
-rwxr-xr-x  unbiased/main.py                     |  2
-rwxr-xr-x  unbiased/parser.py                   | 29
-rw-r--r--  unbiased/unbiasedFunctions.py        | 98
5 files changed, 49 insertions, 89 deletions
diff --git a/setup.py b/setup.py
@@ -13,6 +13,7 @@ setup(
install_requires=[
'jinja2',
'Pillow',
+ 'requests',
],
entry_points={
'console_scripts': [
diff --git a/unbiased/html_template/unbiased.css b/unbiased/html_template/unbiased.css
index 6817cc9..1424ee9 100755
--- a/unbiased/html_template/unbiased.css
+++ b/unbiased/html_template/unbiased.css
@@ -108,8 +108,8 @@ a:hover{
width:350px;
height:200px;
overflow:hidden;
- background-size: auto 234px;/*cover;*/
- background-position: top center;/*center center;*/
+ background-size: 100%;
+ background-position: center center;
margin:0 auto;
}
@@ -169,8 +169,8 @@ a:hover{
width:150px;
height:100px;
overflow:hidden;
- background-size: auto 117px;/*cover;*/
- background-position: top center;/*center center;*/
+ background-size: 100%;
+ background-position: center center;
float:left;
max-width:35%;
}
diff --git a/unbiased/main.py b/unbiased/main.py
index c8a113e..c760788 100755
--- a/unbiased/main.py
+++ b/unbiased/main.py
@@ -76,7 +76,7 @@ def run(webroot, scratch):
logger.debug('Build failed, retrying. source={} ex={}'.format(source, ex))
#scrape all urls and build data structure
- newsSourceArr=buildNewsSourceArr(sourceList, scratch)
+ newsSourceArr = sourceList
#build the output file HTML
outputHTML=buildOutput(newsSourceArr, webroot, scratch)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 2bba27d..0a8398c 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -4,6 +4,9 @@
import logging
import os
import re
import subprocess
+import urllib.parse
+
+import requests
from unbiased.unbiasedObjects import *
from unbiased.unbiasedFunctions import buildArticle
@@ -16,21 +19,11 @@
Takes in a URL, downloads the file to a temp file, reads the file into a string, and returns that string
'''
def urlToContent(url, scratchDir, sourceEncoding='utf8'):
- temp_file = os.path.join(scratchDir, 'temp1.html')
-
- #download file
- #os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
- subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
-
- #read file
- if sourceEncoding=='utf8':
- f=open(temp_file, 'r', encoding="utf8")
+ res = requests.get(url)
+ if res.status_code == 200:
+ return res.text
else:
- f=open(temp_file, 'r', encoding="latin-1")
- content=f.read()
- f.close()
-
- return content
+ raise Exception("Failed to download {}".format(url))
'''
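Note on the hunk above: the requests-based urlToContent drops the effect of the sourceEncoding argument, since res.text is decoded with whatever charset requests detects from the response headers. A minimal sketch (an assumption, not part of this commit) that keeps the old latin-1 fallback would set res.encoding before reading res.text:

import requests

def urlToContent(url, scratchDir, sourceEncoding='utf8'):
    # force the decode the caller asked for; requests otherwise guesses
    # the charset from the Content-Type header
    res = requests.get(url)
    if res.status_code != 200:
        raise Exception("Failed to download {}".format(url))
    res.encoding = 'utf-8' if sourceEncoding == 'utf8' else 'latin-1'
    return res.text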
@@ -39,6 +32,13 @@
calls the file scraper and appends the new Article object. Returns a newsSource2 object
'''
def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
+
+ url_parts = urllib.parse.urlparse(url)
+ scheme = url_parts.scheme
+ h1URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h1URLs]
+ h2URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h2URLs]
+ h3URLs = [urllib.parse.urlparse(x, scheme=scheme).geturl() for x in h3URLs]
+
h1Arr=[]
a=buildArticle(h1URLs[0], name, scratchDir)
if a==None:
@@ -54,7 +54,6 @@ def buildNewsSource2(name, url, h1URLs, h2URLs, h3URLs, scratchDir):
else:
logger.debug('H2 Nonetype in '+name)
-
h3Arr=[]
for x in h3URLs:
a=buildArticle(x, name, scratchDir)
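Note on the urlparse calls added above: passing scheme=scheme normalizes scheme-relative headline links (the //host/path form common in front-page markup) into absolute URLs before they reach buildArticle. A small runnable illustration with hypothetical URLs:

import urllib.parse

# a scheme-relative link gains the front page's scheme
print(urllib.parse.urlparse('//example.com/story.html', scheme='https').geturl())
# -> https://example.com/story.html

# an already-absolute link keeps its own scheme
print(urllib.parse.urlparse('http://example.com/story.html', scheme='https').geturl())
# -> http://example.com/story.html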
diff --git a/unbiased/unbiasedFunctions.py b/unbiased/unbiasedFunctions.py
index 415a3cc..0181beb 100644
--- a/unbiased/unbiasedFunctions.py
+++ b/unbiased/unbiasedFunctions.py
@@ -1,4 +1,5 @@
import html
+import io
import logging
import os
import pkgutil
@@ -6,10 +7,12 @@
import random
import re
import subprocess
import time
-
-from unbiased.unbiasedObjects import *
+import urllib.parse
from PIL import Image
+import requests
+
+from unbiased.unbiasedObjects import *
logger = logging.getLogger('unbiased')
@@ -21,16 +24,25 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
logger.debug(sourceName)
logger.debug(url)
- temp_article = os.path.join(scratchDir, 'temp_article.html')
+ url_parts = urllib.parse.urlparse(url)
+ scheme = url_parts.scheme
#download url
- #os.system('wget -q -O scratch/temp_article.html --no-check-certificate '+url)
- subprocess.check_call(['wget', '-q', '-O', temp_article, '--no-check-certificate', url])
+ try:
+ res = requests.get(url)
+ except Exception as ex:
+ logger.error("""ARTICLE DOWNLOADING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(sourceName, url))
+ return None
- #read the file in
- f=open(temp_article, 'r', encoding="utf8")
- content=f.read()
- f.close()
+ if res.status_code == 200:
+ content = res.text
+ else:
+ logger.error("""ARTICLE DOWNLOADING ERROR
+ SOURCE:\t{}
+ URL:\t{}""".format(sourceName, url))
+ return None
try:
if sourceName=='The Guardian US':
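Note on the hunk above: the exception branch and the non-200 branch log the same ARTICLE DOWNLOADING ERROR block. A shorter alternative sketch (not what this commit does) lets raise_for_status() turn bad statuses into exceptions so both failures share one handler:

try:
    # raise_for_status() raises requests.HTTPError on 4xx/5xx responses
    res = requests.get(url)
    res.raise_for_status()
    content = res.text
except Exception as ex:
    logger.error("""ARTICLE DOWNLOADING ERROR
    SOURCE:\t{}
    URL:\t{}""".format(sourceName, url))
    return None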
@@ -61,6 +73,8 @@ def buildArticle(url, sourceName, scratchDir, encoding=None):#, titleDelStart, t
#trim to just before it then lop it off
img=img[:-1].strip()
img=img[:-1]
+ # fix the scheme if it's missing
+ img = urllib.parse.urlparse(img, scheme=scheme).geturl()
if debugging:
logger.debug(img)
@@ -232,68 +246,16 @@ def printOutputHTML(outputHTML, outDir):
with open(os.path.join(outDir, filename), 'wb') as fp:
fp.write(data)
-def buildNewsSourceArr(sourceList, scratchDir):
-
- #build the data structure
- i=0
- listLen=len(sourceList)
- while i < listLen:
- source=sourceList[i]
-
- if type(source) is NewsSource2:
- i+=1
- continue
-
- url=source.url
-
- temp_file = os.path.join(scratchDir, 'temp{}.html'.format(i))
-
- #download file
- #os.system('wget -q -O scratch/temp'+str(i)+'.html --no-check-certificate '+url)
- subprocess.check_call(['wget', '-q', '-O', temp_file, '--no-check-certificate', url])
-
- #read file
- f=open(temp_file, 'r', encoding="utf8")
- content=f.read()
- f.close()
-
- #delete file MAYBE DON'T DO THIS? CAUSES OS ERRORS
- #os.remove(temp_file)
-
- #add stories etc to the NewsSource object
- h1s, h2s, h3s=extractURLs(content, source)
-
- #build the Article objects and add to newsSource's appropriate list
- if h1s!=None and h2s!=None:
- for url in h1s:
- article=buildArticle(url, source.name, scratchDir)
- if article!=None: source.addArticle(article, 1) #sourceList[i].h1Arr.append(article)
- for url in h2s:
- article=buildArticle(url, source.name, scratchDir)
- if article!=None: sourceList[i].h2Arr.append(article)
- for url in h3s:
- article=buildArticle(url, source.name, scratchDir)
- if article!=None: sourceList[i].h3Arr.append(article)
- i+=1
- else:
- sourceList.remove(source)
- listLen-=1
-
-
- #return the original sourceList,
- #since everything should have been modified in place
- return sourceList
-
def pullImage(url, index, webroot, scratch, target_width=350, target_height=200):
extension = url.split('.')[-1].split('?')[0]
img_name = 'img{}.{}'.format(index, extension)
- tmp_file = os.path.join(scratch, img_name)
- try:
- subprocess.check_call(['wget', '-q', '-O', tmp_file, '--no-check-certificate', url])
- except Exception as ex:
- logger.error('Failed to pull image: url={} ex={}'.format(url, ex))
+ res = requests.get(url)
+ if res.status_code == 200:
+ content = res.content
+ else:
+ logger.error('Image not found: url={}'.format(url))
return ''
- img = Image.open(tmp_file)
+ img = Image.open(io.BytesIO(content))
# crop to aspect ratio
target_ar = target_width / target_height
left, top, right, bottom = img.getbbox()
@@ -315,6 +277,4 @@ def pullImage(url, index, webroot, scratch, target_width=350, target_height=200)
jpg_name = 'img{}.jpg'.format(index)
out_file = os.path.join(webroot, jpg_name)
img.save(out_file, 'JPEG')
- if tmp_file != out_file:
- os.remove(tmp_file)
return jpg_name
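Note on the hunk above: Image.open(io.BytesIO(content)) parses the downloaded bytes in memory, which is why the tmp_file write and the final os.remove cleanup disappear. Pillow will raise if the payload is not a valid image, so a defensive variant (an assumption, not part of this commit) could guard the open:

try:
    img = Image.open(io.BytesIO(content))
    img.load()  # force Pillow to decode now so bad data fails here
except Exception as ex:
    logger.error('Bad image data: url={} ex={}'.format(url, ex))
    return ''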