New parsing method started

Got NYT up and running with new object type and custom parser
author: sstvinc2 <sstvinc2@gmail.com> 2017-02-14 21:02:29 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-14 21:02:29 -0600
commit: 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (patch)
tree: 15df09ca79207e0c87a7460adbe1476b1627e634
parent: 82166863a0c4a8c101d041123c4ac2f098c9ef9a (diff)
5 files changed, 187 insertions, 39 deletions
diff --git a/html_template/unbiased.css b/html_template/unbiased.css
index b632cdd..86f653a 100644
--- a/html_template/unbiased.css
+++ b/html_template/unbiased.css
@@ -19,8 +19,8 @@ a:link, a:visited, a:hover, a:active {
     text-align:center;
     padding:.5em 0 1em;
     margin-bottom:1em;
-    border-bottom:1px solid #000;
-    background:#fdf;
+    border-bottom:3px solid #f00;
+    background:#44f;
 }
 
 .title{
@@ -29,15 +29,16 @@ a:link, a:visited, a:hover, a:active {
 
 #title-1{
     font-style:italic;
-    color:#d00;
+    color:#fff;
 }
 
 #title-2{
-    color:#00d;
+    color:#fff;
 }
 
 #subtitle{
     font-size:1.25em;
+    color:#ccc;
 }
 
 #timestamp{
@@ -59,6 +60,8 @@ a:link, a:visited, a:hover, a:active {
     margin-left:auto;
     margin-right:auto;
     text-align:center;
+    border-bottom: 3px solid #f00;
+    margin-bottom: 10px;
 }
 
 .top-story{
diff --git a/main.py b/main.py
index 163f73a..63fd908 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@
 
 from unbiasedObjects import *
 from unbiasedFunctions import *
+from parser import *
 import time
 
 def main():
@@ -67,7 +68,10 @@ def run():
                                  None, None))
 
 
+    nyt=buildNYT()
+    sourceList.append(nyt)
 
+    '''
     sourceList.append(NewsSource('New York Times',
                                  'http://nytimes.com',
                                  ['<a href="'],
@@ -76,6 +80,10 @@ def run():
                                  '<div class="b-column column">', '<!-- close photo-spot-region -->',
                                  'section id="top-news" class="top-news"', '</div><!-- close a-column -->',
                                  'class="second-column-region region"', 'html.geo-dma-501 .nythpNYRegionPromo'))
+    '''
+
+
+
 
     sourceList.append(NewsSource('Fox News',
                                  'http://foxnews.com',
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..2020f55
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+from unbiasedObjects import *
+from unbiasedFunctions import buildArticle
+import os
+
+def buildNYT():
+    """Scrape the New York Times front page into a NewsSource2.
+
+    Downloads http://www.nytimes.com with wget into scratch/temp1.html,
+    then locates story URLs by splitting the HTML on known markers:
+
+    * h1 -- the first link after 'story theme-summary banner'
+    * h2 -- links in the a-column and b-column, above the break
+    * h3 -- links between the end of the lede package and the tips link
+
+    "collection headlines" lists are cut out of each region before its
+    links are read, and a URL that contains another collected URL is dropped.
+
+    Each URL is passed to buildArticle(); the results are stored as the
+    h1Arr, h2Arr and h3Arr of the returned NewsSource2.
+
+    Raises IndexError if the banner, column or lede markers are missing.
+    """
+    url='http://www.nytimes.com'
+
+    #download file
+    os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+
+    #read file
+    with open('scratch/temp1.html', 'r') as f:#, encoding="utf8")
+        content=f.read()
+
+    #get main headline
+    #this will likely need if/else logic
+    #This is with a large headline over a and b columns
+    h1=content.split('story theme-summary banner', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+
+    #GET SECONDARY HEADLINES
+    #This comes from the a column or b column, above the break
+    h2s=[]
+    #A column
+    h2=content.split('<div class="a-column column">', 1)[1]
+    h2=h2.split('<!-- close a-column -->', 1)[0]
+    _grabURLs(_stripCollections(h2), h1, h2s)
+
+    #B column
+    h2=content.split('<div class="b-column column">', 1)[1]
+    h2=h2.split('<!-- close b-column -->', 1)[0]
+    _grabURLs(_stripCollections(h2), h1, h2s)
+
+    #REMOVE DUPLICATES
+    h2s=_dropSuperstrings(h2s)
+
+
+    #GET TERTIARY HEADLINES
+    h3s=[]
+    h3=content.split('<!-- close lede-package-region -->', 1)[1]
+    h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0]
+    #remove "collection" sets from h3 itself, then grab the remaining URLs
+    _grabURLs(_stripCollections(h3), h1, h3s)
+
+    #REMOVE DUPLICATES
+    h3s=_dropSuperstrings(h3s)
+
+
+    #BUILD THE ARTICLES BASED ON URLS
+    #buildArticle() can return None on a parsing error; entries are kept as-is
+    h1Arr=[buildArticle(h1, 'New York Times')]
+    h2Arr=[buildArticle(x, 'New York Times') for x in h2s]
+    h3Arr=[buildArticle(x, 'New York Times') for x in h3s]
+
+    nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr)
+
+    return nyt
+
+
+def _stripCollections(html):
+    """Cut every 'collection headlines' block out of html.
+
+    A block runs from its opening div to the first following </ul>. If a
+    block has no closing </ul>, everything from its opening div onward is
+    dropped instead of raising.
+    """
+    marker='<div class="collection headlines">'
+    while marker in html:
+        head, _, tail=html.partition(marker)
+        html=head+tail.partition('</ul>')[2]
+    return html
+
+
+def _grabURLs(html, h1, urls):
+    """Append every new href value found in html to urls, in page order.
+
+    Links whose URL contains h1 (the main headline URL) and URLs already
+    present in urls are skipped.
+    urls is modified in place; nothing is returned.
+    """
+    while '<a href="' in html:
+        html=html.split('<a href="', 1)[1]
+        x=html.split('"', 1)[0]
+        if (h1 not in x) and (x not in urls):
+            urls.append(x)
+
+
+def _dropSuperstrings(urls):
+    """Return urls without any entry that contains a different entry.
+
+    When one URL is a substring of another, only the shorter one is kept.
+    The result is built as a new list, so an entry that contains several
+    other entries is dropped once rather than being removed repeatedly
+    from the list in place.
+    """
+    return [u for u in urls if not any(v!=u and v in u for v in urls)]
+
+
+
+
+'''
+EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
+
+<div class="span-ab-layout layout">
+
+    <div class="ab-column column">
+
+        <section id="top-news" class="top-news">
+            <h2 class="section-heading visually-hidden">Top News</h2>
+
+                            <div class="above-banner-region region">
+
+                    <div class="collection">
+            <div class="hpHeader" id="top-megapackage-kicker">
+  <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6>
+</div>
+
+</div>
+
+                </div><!-- close above-banner-region -->
+            
+                            <div class="span-ab-top-region region">
+
+                    <div class="collection">
+            <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner">
+            <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1>
+</article>
+</div>
+
+                </div><!-- close span-ab-top-region -->
+'''
diff --git a/unbiasedFunctions.py b/unbiasedFunctions.py
index c2f62c0..ef6ae7c 100644
--- a/unbiasedFunctions.py
+++ b/unbiasedFunctions.py
@@ -3,6 +3,7 @@ import os
 import random
 import time
 
+
 #take in a url and delimiters, return twitter card
 def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, imgDelEnd):
 
@@ -44,41 +45,6 @@ def buildArticle(url, sourceName):#, titleDelStart, titleDelEnd, imgDelStart, im
         print("Article parsing error in buildArticle() for URL: "+url)
         return None
 
-#do the hardcore HTML parsing
-def splitHTML(content, sectionDividerStart, sectionDividerEnd, delStart, delEnd):
-    retArr=[]
-    
-    if sectionDividerStart!=None:
-        content=content.split(sectionDividerStart)[1]
-    if sectionDividerEnd!=None:
-        content=content.split(sectionDividerEnd)[0]
-    if delStart!=[]:
-        while True:
-            x=content
-            for delim in delStart:
-                if delim in content:
-                    x=content.split(delim)[1]
-            x=x.split(delEnd)[0]
-            if x not in retArr:
-                retArr.append(x)   
-            content=content.split(delStart[0], 1)
-            if(len(content)==1):
-                break
-            else:
-                content=content[1:][0]
-
-    return retArr
-    
-
-
-'''
-**********************8
-
-Need to fix this function to use splitHTML() and actually loop through
-all of the links instead of just using the first one.
-
-************************
-'''
 
 #take in a read main source file (e.g. from nytimes.com) and return lists of the urls for stories
 def extractURLs(content, source):
@@ -202,6 +168,11 @@ def buildNewsSourceArr(sourceList):
     listLen=len(sourceList)
     while i < listLen:
         source=sourceList[i]
+
+        if type(source) is NewsSource2:
+            i+=1
+            continue
+
         url=source.url
 
         #download file
diff --git a/unbiasedObjects.py b/unbiasedObjects.py
index b1f6ec5..2233b0c 100644
--- a/unbiasedObjects.py
+++ b/unbiasedObjects.py
@@ -15,6 +15,21 @@ class Article():
     def __str__(self):
         return '-----------\n'+self.title+'\n'+self.source+'\n'+self.description+'\n'+self.url+'\n'+self.img+'\n'+'-----------'
 
+
+class NewsSource2():  # news source that carries ready-made article lists
+    name=''  # class-level defaults below; __init__ rebinds every one per instance
+    url=''
+    h1Arr=[]
+    h2Arr=[]
+    h3Arr=[]
+    def __init__(self, name, url, h1Arr, h2Arr, h3Arr):  # stores each argument unchanged
+        self.name=name  # source name, e.g. 'New York Times' as passed by buildNYT()
+        self.url=url  # source URL as given by the caller
+        self.h1Arr=h1Arr  # buildNYT() passes the main-headline article list here
+        self.h2Arr=h2Arr  # buildNYT() passes the secondary-headline article list here
+        self.h3Arr=h3Arr  # buildNYT() passes the tertiary-headline article list here
+        
+
         
 class NewsSource():
     name=''
author	sstvinc2 <sstvinc2@gmail.com>	2017-02-14 21:02:29 -0600
committer	sstvinc2 <sstvinc2@gmail.com>	2017-02-14 21:02:29 -0600
commit	7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (patch)
tree	15df09ca79207e0c87a7460adbe1476b1627e634
parent	82166863a0c4a8c101d041123c4ac2f098c9ef9a (diff)