From 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 Mon Sep 17 00:00:00 2001
From: sstvinc2
Date: Tue, 14 Feb 2017 21:02:29 -0600
Subject: New parsing method started

Got NYT up and running with new object type and custom parser
---
 html_template/unbiased.css |  11 ++--
 main.py                    |   8 +++
 parser.py                  | 151 +++++++++++++++++++++++++++++++++++++++++++++
 unbiasedFunctions.py       |  41 ++----------
 unbiasedObjects.py         |  15 +++++
 5 files changed, 187 insertions(+), 39 deletions(-)
 create mode 100644 parser.py

diff --git a/html_template/unbiased.css b/html_template/unbiased.css
index b632cdd..86f653a 100644
--- a/html_template/unbiased.css
+++ b/html_template/unbiased.css
@@ -19,8 +19,8 @@ a:link, a:visited, a:hover, a:active {
 	text-align:center;
 	padding:.5em 0 1em;
 	margin-bottom:1em;
-	border-bottom:1px solid #000;
-	background:#fdf;
+	border-bottom:3px solid #f00;
+	background:#44f;
 }
 
 .title{
@@ -29,15 +29,16 @@ a:link, a:visited, a:hover, a:active {
 
 #title-1{
 	font-style:italic;
-	color:#d00;
+	color:#fff;
 }
 
 #title-2{
-	color:#00d;
+	color:#fff;
 }
 
 #subtitle{
 	font-size:1.25em;
+	color:#ccc;
 }
 
 #timestamp{
@@ -59,6 +60,8 @@ a:link, a:visited, a:hover, a:active {
 	margin-left:auto;
 	margin-right:auto;
 	text-align:center;
+	border-bottom: 3px solid #f00;
+	margin-bottom: 10px;
 }
 
 .top-story{
diff --git a/main.py b/main.py
index 163f73a..63fd908 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@
 
 from unbiasedObjects import *
 from unbiasedFunctions import *
+from parser import *
 import time
 
 def main():
@@ -67,7 +68,10 @@ def run():
 			None,
 			None))
 
+	nyt=buildNYT()
+	sourceList.append(nyt)
+
+	'''
 	sourceList.append(NewsSource('New York Times',
 			'http://nytimes.com',
 			['', '',
 			'section id="top-news" class="top-news"', '',
 			'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo'))
+	'''
+
+
 
 	sourceList.append(NewsSource('Fox News',
 			'http://foxnews.com',
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..2020f55
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+from unbiasedObjects import *
+from unbiasedFunctions import buildArticle
+import os
+
+def buildNYT():
+    url='http://www.nytimes.com'
+
+    #download file
+    os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+
+    #read file
+    f=open('scratch/temp1.html', 'r')#, encoding="utf8")
+    content=f.read()
+    f.close()
+
+    #get main headline
+    #this will likely need if/else logic
+    h1=content
+
+    #This is with a large headline over a and b columns
+    h1=h1.split('story theme-summary banner', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+    h1s=[h1]
+
+    #get the first column of second-level headlines
+    h2=content
+    h2=h2.split('<div class="a-column column">', 1)[1]
+    h2=h2.split('<div class="b-column column">', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection' in h2:
+        arr=h2.split('<div class="collection', 1)
+        h2=arr[0]+arr[1].split('</div>', 1)[1]
+    #Grab the remaining URLs
+    h2s=[]
+    while '<a href="' in h2:
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if x not in h1s and x not in h2s:
+            h2s.append(x)
+
+    #get the second column of second-level headlines
+    h2=content
+    h2=h2.split('<div class="b-column column">', 1)[1]
+    h2=h2.split('</section>', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection' in h2:
+        arr=h2.split('<div class="collection', 1)
+        h2=arr[0]+arr[1].split('</div>', 1)[1]
+    #Grab the remaining URLs
+    while '<a href="' in h2:
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if x not in h1s and x not in h2s:
+            h2s.append(x)
+
+    #get the lower-level headlines from the second column region
+    h3=content
+    h3=h3.split('class="second-column-region region"', 1)[1]
+    h3=h3.split('</section>', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection' in h3:
+        arr=h3.split('<div class="collection', 1)
+        h3=arr[0]+arr[1].split('</div>', 1)[1]
+
+    #Grab the remaining URLs
+    h3s=[]
+    while '<a href="' in h3:
+        h3=h3.split('<a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if x not in h1s and x not in h2s and x not in h3s:
+            h3s.append(x)
+
+    #build Article objects from the URLs at each level
+    h1Arr=[]
+    for x in h1s:
+        a=buildArticle(x, 'New York Times')
+        if a!=None:
+            h1Arr.append(a)
+
+    h2Arr=[]
+    for x in h2s:
+        a=buildArticle(x, 'New York Times')
+        if a!=None:
+            h2Arr.append(a)
+
+    h3Arr=[]
+    for x in h3s:
+        a=buildArticle(x, 'New York Times')
+        if a!=None:
+            h3Arr.append(a)
+
+    nyt=NewsSource2('New York Times', url, h1Arr, h2Arr, h3Arr)
+
+    return nyt
+
+
+'''
+Scratch reference: saved copy of the nytimes.com "Top News" section markup
+that the delimiters above were written against (HTML omitted here).
+'''
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index c2f62c0..ef6ae7c 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -3,6 +3,7 @@ import os
 import random
 import time
 
+
 #take in a url and delimiters, return twitter card
 def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
 
@@ -44,41 +45,6 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
 		print("Article parsing error in buildArticle() for URL: "+url)
 		return None
 
-#do the hardcore HTML parsing
-def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
-	retArr=[]
-
-	if sectionDividerStart!=None:
-		content=content.split(sectionDividerStart)[1]
-	if sectionDividerEnd!=None:
-		content=content.split(sectionDividerEnd)[0]
-	if delStart!=[]:
-		while True:
-			x=content
-			for delim in delStart:
-				if delim in content:
-					x=content.split(delim)[1]
-			x=x.split(delEnd)[0]
-			if x not in retArr:
-				retArr.append(x)
-			content=content.split(delStart[0], 1)
-			if(len(content)==1):
-				break
-			else:
-				content=content[1:][0]
-
-	return retArr
-
-
-
-'''
-**********************8
-
-Need to fix this function to use splitHTML() and actually loop through
-all of the links instead of just using the first one.
-
-************************
-'''
 #take in a read main source file (e.g. from nytimes.com) and return lists of the urls for stories
 def extractURLs(content, source):
 
@@ -202,6 +168,11 @@ def buildNewsSourceArr(sourceList):
 	listLen=len(sourceList)
 	while i < listLen:
 		source=sourceList[i]
+
+		if type(source) is NewsSource2:
+			i+=1
+			continue
+
 		url=source.url
 
 		#download file
diff --git a/unbiasedObjects.py b/unbiasedObjects.py
index b1f6ec5..2233b0c 100644
--- a/unbiasedObjects.py
+++ b/unbiasedObjects.py
@@ -15,6 +15,21 @@ class Article():
 
 	def __str__(self):
 		return '-----------\n'+self.title+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------'
 
+
+class NewsSource2():
+	name=''
+	url=''
+	h1Arr=[]
+	h2Arr=[]
+	h3Arr=[]
+	def __init__(self, name, url, h1Arr, h2Arr, h3Arr):
+		self.name=name
+		self.url=url
+		self.h1Arr=h1Arr
+		self.h2Arr=h2Arr
+		self.h3Arr=h3Arr
+
+
 class NewsSource():
 	name=''
-- 
cgit v1.2.3
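
Usage note (not part of the patch above): a minimal sketch of how the new code path introduced by this commit is meant to be driven. It uses only names visible in the diff (buildNYT from parser.py, NewsSource2 and its h1Arr/h2Arr/h3Arr fields from unbiasedObjects.py); the print loop at the end is purely illustrative, and the sketch assumes it is run from the repository root so the local parser.py is imported and a scratch/ directory exists for the wget output.

    from parser import buildNYT          # parser.py added by this commit
    from unbiasedObjects import NewsSource2

    sourceList=[]

    # New-style source: parser.py scrapes nytimes.com itself and returns a
    # NewsSource2 whose h1Arr/h2Arr/h3Arr already hold parsed Article objects.
    nyt=buildNYT()
    sourceList.append(nyt)

    # buildNewsSourceArr() in unbiasedFunctions.py now skips NewsSource2
    # entries, so new-style and legacy NewsSource objects can share the
    # same sourceList.

    for source in sourceList:
        if type(source) is NewsSource2:
            print(source.name, source.url)
            print(len(source.h1Arr), len(source.h2Arr), len(source.h3Arr))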