New parsing method started

Got NYT up and running with new object type and custom parser
author: sstvinc2 <sstvinc2@gmail.com> 2017-02-14 21:02:29 -0600
committer: sstvinc2 <sstvinc2@gmail.com> 2017-02-14 21:02:29 -0600
commit: 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (patch)
tree: 15df09ca79207e0c87a7460adbe1476b1627e634 /parser.py
parent: 82166863a0c4a8c101d041123c4ac2f098c9ef9a (diff)
1 files changed, 151 insertions, 0 deletions
diff --git a/parser.py b/parser.py
new file mode 100644
index 0000000..2020f55
--- /dev/null
+++ b/parser.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+from unbiasedObjects import *
+from unbiasedFunctions import buildArticle
+import os
+
+def buildNYT():
+    url='http://www.nytimes.com'
+
+    #download file
+    os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url)
+    
+    #read file
+    f=open('scratch/temp1.html', 'r')#, encoding="utf8")
+    content=f.read()
+    f.close()
+
+    #get main headline
+    #this will likely need if/else logic
+    h1=content
+
+    #This is with a large headline over a and b columns
+    h1=h1.split('story theme-summary banner', 1)[1]
+    h1=h1.split('<a href="', 1)[1]
+    h1=h1.split('"', 1)[0]
+
+    #GET SECONARY HEADLINES
+    #This comes from the a column or b column, above the break
+    h2=content
+    h2s=[]
+    #A column
+    h2=h2.split('<div class="a-column column">', 1)[1]
+    h2=h2.split('<!-- close a-column -->', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection headlines">' in h2:
+        arr=h2.split('<div class="collection headlines">', 1)
+        h2=arr[0]+arr[1].split('</ul>', 1)[1]
+    #Grab the remaining URLs
+    while '<a href="' in h2:
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if h1 not in x:
+            h2s.append(x)
+
+    #B column
+    h2=content
+    h2=h2.split('<div class="b-column column">', 1)[1]
+    h2=h2.split('<!-- close b-column -->', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection headlines">' in h2:
+        arr=h2.split('<div class="collection headlines">', 1)
+        h2=arr[0]+arr[1].split('</ul>', 1)[1]
+    #Grab the remaining URLs
+    while '<a href="' in h2:
+        h2=h2.split('<a href="', 1)[1]
+        x=h2.split('"', 1)[0]
+        if (h1 not in x) and (x not in h2s):
+            h2s.append(x)
+
+    #REMOVE DUPLICATES
+    removeArr=[]
+    for i in range(len(h2s)):
+        for j in range(len(h2s)):
+            if i==j:
+                continue
+            else:
+                if h2s[i] in h2s[j]:
+                    removeArr.append(h2s[j])
+    for x in removeArr:
+        h2s.remove(x)
+
+
+    #GET TERTIARY HEADLINES
+    h3=content
+    h3s=[]
+    h3=h3.split('<!-- close lede-package-region -->', 1)[1]
+    h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0]
+    #remove "collection" sets
+    while '<div class="collection headlines">' in h2:
+        arr=h3.split('<div class="collection headlines">', 1)
+        h3=arr[0]+arr[1].split('</ul>', 1)[1]
+    
+    #Grab the remaining URLs
+    while '<a href="' in h3:
+        h3=h3.split('<a href="', 1)[1]
+        x=h3.split('"', 1)[0]
+        if (h1 not in x) and (x not in h3s):
+            h3s.append(x)
+    #REMOVE DUPLICATES
+    removeArr=[]
+    for i in range(len(h3s)):
+        for j in range(len(h3s)):
+            if i==j:
+                continue
+            else:
+                if h3s[i] in h3s[j]:
+                    removeArr.append(h3s[j])
+    for x in removeArr:
+        h3s.remove(x)
+
+
+    #BUILD THE ARTICLES BASED ON URLS
+    h1Arr=[]
+    h1Arr.append(buildArticle(h1, 'New York Times'))
+
+    h2Arr=[]
+    for x in h2s:
+        h2Arr.append(buildArticle(x, 'New York Times'))
+
+    h3Arr=[]
+    for x in h3s:
+        h3Arr.append(buildArticle(x, 'New York Times'))
+
+    nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr)
+
+    return nyt
+
+
+
+
+'''
+EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS
+
+<div class="span-ab-layout layout">
+
+    <div class="ab-column column">
+
+        <section id="top-news" class="top-news">
+            <h2 class="section-heading visually-hidden">Top News</h2>
+
+                            <div class="above-banner-region region">
+
+                    <div class="collection">
+            <div class="hpHeader" id="top-megapackage-kicker">
+  <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6>
+</div>
+
+</div>
+
+                </div><!-- close above-banner-region -->
+            
+                            <div class="span-ab-top-region region">
+
+                    <div class="collection">
+            <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner">
+            <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1>
+</article>
+</div>
+
+                </div><!-- close span-ab-top-region -->
+'''
author	sstvinc2 <sstvinc2@gmail.com>	2017-02-14 21:02:29 -0600
committer	sstvinc2 <sstvinc2@gmail.com>	2017-02-14 21:02:29 -0600
commit	7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (patch)
tree	15df09ca79207e0c87a7460adbe1476b1627e634 /parser.py
parent	82166863a0c4a8c101d041123c4ac2f098c9ef9a (diff)