author    sstvinc2 <sstvinc2@gmail.com>  2017-02-14 21:02:29 -0600
committer sstvinc2 <sstvinc2@gmail.com>  2017-02-14 21:02:29 -0600
commit    7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (patch)
tree      15df09ca79207e0c87a7460adbe1476b1627e634
parent    82166863a0c4a8c101d041123c4ac2f098c9ef9a (diff)
New parsing method started
Got NYT up and running with new object type and custom parser
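
The gist of the change: instead of feeding nytimes.com through the generic delimiter-based NewsSource path, main.py now calls the dedicated parser and gets back a pre-built NewsSource2. A minimal sketch of the intended usage (not part of the diff below; it assumes a scratch/ directory exists for the wget download and that the repo modules are on the import path):

    from parser import buildNYT
    from unbiasedObjects import NewsSource2

    # buildNYT() downloads the NYT front page, pulls the main, secondary,
    # and tertiary headline URLs, and wraps them as Article objects
    nyt = buildNYT()
    assert type(nyt) is NewsSource2

    # buildArticle() can return None on a parse error, so skip those
    for article in nyt.h1Arr + nyt.h2Arr + nyt.h3Arr:
        if article is not None:
            print(article)

buildNewsSourceArr() in unbiasedFunctions.py now skips NewsSource2 entries, since they arrive already populated.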
-rw-r--r--  html_template/unbiased.css   11
-rw-r--r--  main.py                       8
-rw-r--r--  parser.py                   151
-rw-r--r--  unbiasedFunctions.py         41
-rw-r--r--  unbiasedObjects.py           15
5 files changed, 187 insertions, 39 deletions
diff --git a/html_template/unbiased.css b/html_template/unbiased.css
index b632cdd..86f653a 100644
--- a/html_template/unbiased.css
+++ b/html_template/unbiased.css
@@ -19,8 +19,8 @@ a:link, a:visited, a:hover, a:active {
text-align:center;
padding:.5em 0 1em;
margin-bottom:1em;
- border-bottom:1px solid #000;
- background:#fdf;
+ border-bottom:3px solid #f00;
+ background:#44f;
}
.title{
@@ -29,15 +29,16 @@ a:link, a:visited, a:hover, a:active {
#title-1{
font-style:italic;
- color:#d00;
+ color:#fff;
}
#title-2{
- color:#00d;
+ color:#fff;
}
#subtitle{
font-size:1.25em;
+ color:#ccc;
}
#timestamp{
@@ -59,6 +60,8 @@ a:link, a:visited, a:hover, a:active {
margin-left:auto;
margin-right:auto;
text-align:center;
+ border-bottom: 3px solid #f00;
+ margin-bottom: 10px;
}
.top-story{
diff --git a/main.py b/main.py
index 163f73a..63fd908 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@
from unbiasedObjects import *
from unbiasedFunctions import *
+from parser import *
import time
def main():
@@ -67,7 +68,10 @@ def run():
None, None))
+ nyt=buildNYT()
+ sourceList.append(nyt)
+ '''
sourceList.append(NewsSource('New York Times',
'http://nytimes.com',
['<a href="'],
@@ -76,6 +80,10 @@ def run():
'<div class="b-column column">', '<!-- close photo-spot-region -->',
'section id="top-news" class="top-news"', '</div><!-- close a-column -->',
'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo'))
+ '''
+
+
+
sourceList.append(NewsSource('Fox News',
'http://foxnews.com',
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..2020f55
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+from unbiasedObjects import *
+from unbiasedFunctions import buildArticle
+import os
+
+def buildNYT():
+    url='http://www.nytimes.com'
+
+    #download file
+    os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+
+    #read file
+    f=open('scratch/temp1.html', 'r')#, encoding="utf8")
+    content=f.read()
+    f.close()
+
+    #get main headline
+    #this will likely need if/else logic
+    h1=content
+
+    #This is with a large headline over a and b columns
+    h1=h1.split('story theme-summary banner', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+
+    #GET SECONDARY HEADLINES
+    #This comes from the a column or b column, above the break
+    h2=content
+    h2s=[]
+    #A column
+    h2=h2.split('<div class="a-column column">', 1)[1]
+    h2=h2.split('<!-- close a-column -->', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection headlines">' in h2:
+        arr=h2.split('<div class="collection headlines">', 1)
+        h2=arr[0]+arr[1].split('</ul>', 1)[1]
+    #Grab the remaining URLs
+    while '<a href="' in h2:
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if h1 not in x:
+            h2s.append(x)
+
+    #B column
+    h2=content
+    h2=h2.split('<div class="b-column column">', 1)[1]
+    h2=h2.split('<!-- close b-column -->', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection headlines">' in h2:
+        arr=h2.split('<div class="collection headlines">', 1)
+        h2=arr[0]+arr[1].split('</ul>', 1)[1]
+    #Grab the remaining URLs
+    while '<a href="' in h2:
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if (h1 not in x) and (x not in h2s):
+            h2s.append(x)
+
+    #REMOVE DUPLICATES
+    removeArr=[]
+    for i in range(len(h2s)):
+        for j in range(len(h2s)):
+            if i==j:
+                continue
+            else:
+                if h2s[i] in h2s[j]:
+                    removeArr.append(h2s[j])
+    for x in removeArr:
+        h2s.remove(x)
+
+
+    #GET TERTIARY HEADLINES
+    h3=content
+    h3s=[]
+    h3=h3.split('<!-- close lede-package-region -->', 1)[1]
+    h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection headlines">' in h3:
+        arr=h3.split('<div class="collection headlines">', 1)
+        h3=arr[0]+arr[1].split('</ul>', 1)[1]
+
+    #Grab the remaining URLs
+    while '<a href="' in h3:
+        h3=h3.split('<a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if (h1 not in x) and (x not in h3s):
+            h3s.append(x)
+    #REMOVE DUPLICATES
+    removeArr=[]
+    for i in range(len(h3s)):
+        for j in range(len(h3s)):
+            if i==j:
+                continue
+            else:
+                if h3s[i] in h3s[j]:
+                    removeArr.append(h3s[j])
+    for x in removeArr:
+        h3s.remove(x)
+
+
+    #BUILD THE ARTICLES BASED ON URLS
+    h1Arr=[]
+    h1Arr.append(buildArticle(h1, 'New York Times'))
+
+    h2Arr=[]
+    for x in h2s:
+        h2Arr.append(buildArticle(x, 'New York Times'))
+
+    h3Arr=[]
+    for x in h3s:
+        h3Arr.append(buildArticle(x, 'New York Times'))
+
+    nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr)
+
+    return nyt
+
+
+
+
+'''
+EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
+
+<div class="span-ab-layout layout">
+
+ <div class="ab-column column">
+
+ <section id="top-news" class="top-news">
+ <h2 class="section-heading visually-hidden">Top News</h2>
+
+ <div class="above-banner-region region">
+
+ <div class="collection">
+ <div class="hpHeader" id="top-megapackage-kicker">
+ <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6>
+</div>
+
+</div>
+
+ </div><!-- close above-banner-region -->
+
+ <div class="span-ab-top-region region">
+
+ <div class="collection">
+ <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner">
+ <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1>
+</article>
+</div>
+
+ </div><!-- close span-ab-top-region -->
+'''
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index c2f62c0..ef6ae7c 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -3,6 +3,7 @@ import os
import random
import time
+
#take in a url and delimiters, return twitter card
def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
@@ -44,41 +45,6 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
print("Article parsing error in buildArticle() for URL: "+url)
return None
-#do the hardcore HTML parsing
-def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
- retArr=[]
-
- if sectionDividerStart!=None:
- content=content.split(sectionDividerStart)[1]
- if sectionDividerEnd!=None:
- content=content.split(sectionDividerEnd)[0]
- if delStart!=[]:
- while True:
- x=content
- for delim in delStart:
- if delim in content:
- x=content.split(delim)[1]
- x=x.split(delEnd)[0]
- if x not in retArr:
- retArr.append(x)
- content=content.split(delStart[0], 1)
- if(len(content)==1):
- break
- else:
- content=content[1:][0]
-
- return retArr
-
-
-
-'''
-**********************8
-
-Need to fix this function to use splitHTML() and actually loop through
-all of the links instead of just using the first one.
-
-************************
-'''
#take in a read main source file (e.g. from nytimes.com) and return lists of the urls for stories
def extractURLs(content, source):
@@ -202,6 +168,11 @@ def buildNewsSourceArr(sourceList):
listLen=len(sourceList)
while i < listLen:
source=sourceList[i]
+
+        if type(source) is NewsSource2:
+            i+=1
+            continue
+
url=source.url
#download file
diff --git a/unbiasedObjects.py b/unbiasedObjects.py
index b1f6ec5..2233b0c 100644
--- a/unbiasedObjects.py
+++ b/unbiasedObjects.py
@@ -15,6 +15,21 @@ class Article():
def __str__(self):
return '-----------\n'+self.title+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------'
+
+class NewsSource2():
+    name=''
+    url=''
+    h1Arr=[]
+    h2Arr=[]
+    h3Arr=[]
+    def __init__(self, name, url, h1Arr, h2Arr, h3Arr):
+        self.name=name
+        self.url=url
+        self.h1Arr=h1Arr
+        self.h2Arr=h2Arr
+        self.h3Arr=h3Arr
+
+
class NewsSource():
name=''