#!/usr/bin/env python3

import argparse
import io
import logging
import logging.config
import os
import time

from unbiased.unbiasedFunctions import pickStories, pullImage, buildOutput, write_files, write_static_files
from unbiased.sources import get_sources

logger = logging.getLogger('unbiased')

logging_config = {
    'version': 1,
    'formatters': {
        'console': {
            'format': '%(levelname)s %(filename)s:%(lineno)d %(message)s',
        },
        'file': {
            'format': '%(asctime)s %(levelname)s %(filename)s:%(lineno)d %(message)s',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'INFO',
            'formatter': 'console',
        },
        'file': {
            'class': 'logging.handlers.RotatingFileHandler',
            'level': 'DEBUG',
            'formatter': 'file',
            'filename': '',
            'maxBytes': 1024 * 1024,
            'backupCount': 3,
        },
    },
    'loggers': {
        'unbiased': {
            'handlers': ['console', 'file'],
        },
    },
    'root': {
        'level': 'DEBUG',
    },
}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('webroot', help='location to write html output')
    parser.add_argument('-l', '--log-dir', help='location to write detailed logs')
    parser.add_argument('-d', '--debug', action='store_true', help='run in debug mode')
    parser.add_argument('-o', '--oneshot', action='store_true', help='run once and exit')
    parser.add_argument('-s', '--sources', type=lambda x: x.split(','), default=None,
                        help='comma-separated list of sources to crawl')
    args = parser.parse_args()

    # Only log to a file when a log directory is given; otherwise drop the file handler.
    if args.log_dir:
        logging_config['handlers']['file']['filename'] = os.path.join(args.log_dir, 'unbiased.debug.log')
    else:
        logging_config['loggers']['unbiased']['handlers'].remove('file')
        del logging_config['handlers']['file']
    if args.debug:
        logging_config['handlers']['console']['level'] = 'DEBUG'
    logging.config.dictConfig(logging_config)

    crawl_frequency = 600
    while True:
        logger.info('Starting crawl')
        start = time.time()
        run(args.webroot, args.sources)
        finish = time.time()
        runtime = finish - start
        sleeptime = crawl_frequency - runtime
        logger.info('Crawl complete in {}s. Sleeping for {}s'.format(int(runtime), int(sleeptime)))
        if args.oneshot:
            break
        if sleeptime > 0:
            time.sleep(sleeptime)


def run(webroot, source_names):
    logger.debug('Running with webroot="{}" for sources="{}"'.format(webroot, source_names))

    sources = get_sources()
    logger.debug('Available sources: {}'.format(list(sources.keys())))
    if source_names is None:
        sources = sources.values()
    else:
        sources = [sources[x] for x in source_names]

    # Crawl each source, retrying up to three times with a short backoff between attempts.
    built_sources = []
    for source in sources:
        logger.info('Crawling {}'.format(source.name))
        tries = 0
        while tries < 3:
            time.sleep(tries)
            try:
                built_sources.append(source.build())
                break
            except Exception as ex:
                tries += 1
                if tries == 3:
                    logger.error('Build failed. source={} ex={}'.format(source.name, ex))
                else:
                    logger.debug('Build failed, retrying. source={} ex={}'.format(source.name, ex))
    sources = tuple(built_sources)
    logger.info('Parsed home pages for: {}'.format([x.name for x in sources]))

    top_stories, middle_stories, bottom_stories = pickStories(sources)
    logger.info('Picked top stories from: {}'.format([x.source for x in top_stories]))
    logger.info('Picked middle stories from: {}'.format([x.source for x in middle_stories]))
    logger.info('Picked bottom stories from: {}'.format([x.source for x in bottom_stories]))

    files_to_write = {}

    # download images
    img_idx = 0
    for story in top_stories:
        story.img, img_jpg = pullImage(story.img, img_idx, webroot, 350, 200)
        files_to_write[story.img] = img_jpg
        img_idx += 1
    for story in middle_stories:
        story.img, img_jpg = pullImage(story.img, img_idx, webroot, 150, 100)
        files_to_write[story.img] = img_jpg
        img_idx += 1

    # build the output file HTML
    output_html = buildOutput(top_stories, middle_stories, bottom_stories)
    output_html = io.BytesIO(output_html.encode('utf8'))
    files_to_write['index.html'] = output_html

    write_files(files_to_write, webroot)
    write_static_files(webroot)


if __name__ == '__main__':
    main()