diff options
| field | value | date |
|---|---|---|
| author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:02:29 -0600 |
| committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:02:29 -0600 |
| commit | 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (patch) | |
| tree | 15df09ca79207e0c87a7460adbe1476b1627e634 | |
| parent | 82166863a0c4a8c101d041123c4ac2f098c9ef9a (diff) | |
New parsing method started
Got NYT up and running with new object type and custom parser
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | html_template/unbiased.css | 11 |
| -rw-r--r-- | main.py | 8 |
| -rw-r--r-- | parser.py | 151 |
| -rw-r--r-- | unbiasedFunctions.py | 41 |
| -rw-r--r-- | unbiasedObjects.py | 15 |

5 files changed, 187 insertions, 39 deletions
diff --git a/html_template/unbiased.css b/html_template/unbiased.css index b632cdd..86f653a 100644 --- a/html_template/unbiased.css +++ b/html_template/unbiased.css @@ -19,8 +19,8 @@ a:link, a:visited, a:hover, a:active { text-align:center;
padding:.5em 0 1em;
margin-bottom:1em;
- border-bottom:1px solid #000;
- background:#fdf;
+ border-bottom:3px solid #f00;
+ background:#44f;
}
.title{
@@ -29,15 +29,16 @@ a:link, a:visited, a:hover, a:active { #title-1{
font-style:italic;
- color:#d00;
+ color:#fff;
}
#title-2{
- color:#00d;
+ color:#fff;
}
#subtitle{
font-size:1.25em;
+ color:#ccc;
}
#timestamp{
@@ -59,6 +60,8 @@ a:link, a:visited, a:hover, a:active { margin-left:auto;
margin-right:auto;
text-align:center;
+ border-bottom: 3px solid #f00;
+ margin-bottom: 10px;
}
.top-story{
@@ -2,6 +2,7 @@ from unbiasedObjects import * from unbiasedFunctions import * +from parser import * import time def main(): @@ -67,7 +68,10 @@ def run(): None, None)) + nyt=buildNYT() + sourceList.append(nyt) + ''' sourceList.append(NewsSource('New York Times', 'http://nytimes.com', ['<a href="'], @@ -76,6 +80,10 @@ def run(): '<div class="b-column column">', '<!-- close photo-spot-region -->', 'section id="top-news" class="top-news"', '</div><!-- close a-column -->', 'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo')) + ''' + + + sourceList.append(NewsSource('Fox News', 'http://foxnews.com', diff --git a/parser.py b/parser.py new file mode 100644 index 0000000..2020f55 --- /dev/null +++ b/parser.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import buildArticle +import os + +def buildNYT(): + url='http://www.nytimes.com' + + #download file + os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + + #read file + f=open('scratch/temp1.html', 'r')#, encoding="utf8") + content=f.read() + f.close() + + #get main headline + #this will likely need if/else logic + h1=content + + #This is with a large headline over a and b columns + h1=h1.split('story theme-summary banner', 1)[1] + h1=h1.split('<a href="', 1)[1] + h1=h1.split('"', 1)[0] + + #GET SECONARY HEADLINES + #This comes from the a column or b column, above the break + h2=content + h2s=[] + #A column + h2=h2.split('<div class="a-column column">', 1)[1] + h2=h2.split('<!-- close a-column -->', 1)[0] + #remove "collection" sets + while '<div class="collection headlines">' in h2: + arr=h2.split('<div class="collection headlines">', 1) + h2=arr[0]+arr[1].split('</ul>', 1)[1] + #Grab the remaining URLs + while '<a href="' in h2: + h2=h2.split('<a href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(x) + + #B column + h2=content + h2=h2.split('<div class="b-column column">', 1)[1] + h2=h2.split('<!-- close 
b-column -->', 1)[0] + #remove "collection" sets + while '<div class="collection headlines">' in h2: + arr=h2.split('<div class="collection headlines">', 1) + h2=arr[0]+arr[1].split('</ul>', 1)[1] + #Grab the remaining URLs + while '<a href="' in h2: + h2=h2.split('<a href="', 1)[1] + x=h2.split('"', 1)[0] + if (h1 not in x) and (x not in h2s): + h2s.append(x) + + #REMOVE DUPLICATES + removeArr=[] + for i in range(len(h2s)): + for j in range(len(h2s)): + if i==j: + continue + else: + if h2s[i] in h2s[j]: + removeArr.append(h2s[j]) + for x in removeArr: + h2s.remove(x) + + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('<!-- close lede-package-region -->', 1)[1] + h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0] + #remove "collection" sets + while '<div class="collection headlines">' in h2: + arr=h3.split('<div class="collection headlines">', 1) + h3=arr[0]+arr[1].split('</ul>', 1)[1] + + #Grab the remaining URLs + while '<a href="' in h3: + h3=h3.split('<a href="', 1)[1] + x=h3.split('"', 1)[0] + if (h1 not in x) and (x not in h3s): + h3s.append(x) + #REMOVE DUPLICATES + removeArr=[] + for i in range(len(h3s)): + for j in range(len(h3s)): + if i==j: + continue + else: + if h3s[i] in h3s[j]: + removeArr.append(h3s[j]) + for x in removeArr: + h3s.remove(x) + + + #BUILD THE ARTICLES BASED ON URLS + h1Arr=[] + h1Arr.append(buildArticle(h1, 'New York Times')) + + h2Arr=[] + for x in h2s: + h2Arr.append(buildArticle(x, 'New York Times')) + + h3Arr=[] + for x in h3s: + h3Arr.append(buildArticle(x, 'New York Times')) + + nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr) + + return nyt + + + + +''' +EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS + +<div class="span-ab-layout layout"> + + <div class="ab-column column"> + + <section id="top-news" class="top-news"> + <h2 class="section-heading visually-hidden">Top News</h2> + + <div class="above-banner-region region"> + + <div class="collection"> + <div 
class="hpHeader" id="top-megapackage-kicker"> + <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6> +</div> + +</div> + + </div><!-- close above-banner-region --> + + <div class="span-ab-top-region region"> + + <div class="collection"> + <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner"> + <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1> +</article> +</div> + + </div><!-- close span-ab-top-region --> +''' diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py index c2f62c0..ef6ae7c 100644 --- a/unbiasedFunctions.py +++ b/unbiasedFunctions.py @@ -3,6 +3,7 @@ import os import random
import time
+
#take in a url and delimiters, return twitter card
def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
@@ -44,41 +45,6 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im print("Article parsing error in buildArticle() for URL: "+url)
return None
-#do the hardcore HTML parsing
-def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
- retArr=[]
-
- if sectionDividerStart!=None:
- content=content.split(sectionDividerStart)[1]
- if sectionDividerEnd!=None:
- content=content.split(sectionDividerEnd)[0]
- if delStart!=[]:
- while True:
- x=content
- for delim in delStart:
- if delim in content:
- x=content.split(delim)[1]
- x=x.split(delEnd)[0]
- if x not in retArr:
- retArr.append(x)
- content=content.split(delStart[0], 1)
- if(len(content)==1):
- break
- else:
- content=content[1:][0]
-
- return retArr
-
-
-
-'''
-**********************8
-
-Need to fix this function to use splitHTML() and actually loop through
-all of the links instead of just using the first one.
-
-************************
-'''
#take in a read main source file (e.g. from nytimes.com) and return lists of the urls for stories
def extractURLs(content, source):
@@ -202,6 +168,11 @@ def buildNewsSourceArr(sourceList): listLen=len(sourceList)
while i < listLen:
source=sourceList[i]
+
+ if type(source) is NewsSource2:
+ i+=1
+ continue
+
url=source.url
#download file
diff --git a/unbiasedObjects.py b/unbiasedObjects.py index b1f6ec5..2233b0c 100644 --- a/unbiasedObjects.py +++ b/unbiasedObjects.py @@ -15,6 +15,21 @@ class Article(): def __str__(self):
return '-----------\n'+self.title+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------'
+
+class NewsSource2():
+ name=''
+ url=''
+ h1Arr=[]
+ h2Arr=[]
+ h3Arr=[]
+ def __init__(self, name, url, h1Arr, h2Arr, h3Arr):
+ self.name=name
+ self.url=url
+ self.h1Arr=h1Arr
+ self.h2Arr=h2Arr
+ self.h3Arr=h3Arr
+
+
class NewsSource():
name=''
|