diff options
author | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:02:29 -0600 |
---|---|---|
committer | sstvinc2 <sstvinc2@gmail.com> | 2017-02-14 21:02:29 -0600 |
commit | 7ceea6a5a495302ffdec9921ea9f841a2b6df8c2 (patch) | |
tree | 15df09ca79207e0c87a7460adbe1476b1627e634 /parser.py | |
parent | 82166863a0c4a8c101d041123c4ac2f098c9ef9a (diff) |
New parsing method started
Got NYT up and running with new object type and custom parser
Diffstat (limited to 'parser.py')
-rw-r--r-- | parser.py | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/parser.py b/parser.py new file mode 100644 index 0000000..2020f55 --- /dev/null +++ b/parser.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +from unbiasedObjects import * +from unbiasedFunctions import buildArticle +import os + +def buildNYT(): + url='http://www.nytimes.com' + + #download file + os.system('wget -q -O scratch/temp1.html --no-check-certificate '+url) + + #read file + f=open('scratch/temp1.html', 'r')#, encoding="utf8") + content=f.read() + f.close() + + #get main headline + #this will likely need if/else logic + h1=content + + #This is with a large headline over a and b columns + h1=h1.split('story theme-summary banner', 1)[1] + h1=h1.split('<a href="', 1)[1] + h1=h1.split('"', 1)[0] + + #GET SECONARY HEADLINES + #This comes from the a column or b column, above the break + h2=content + h2s=[] + #A column + h2=h2.split('<div class="a-column column">', 1)[1] + h2=h2.split('<!-- close a-column -->', 1)[0] + #remove "collection" sets + while '<div class="collection headlines">' in h2: + arr=h2.split('<div class="collection headlines">', 1) + h2=arr[0]+arr[1].split('</ul>', 1)[1] + #Grab the remaining URLs + while '<a href="' in h2: + h2=h2.split('<a href="', 1)[1] + x=h2.split('"', 1)[0] + if h1 not in x: + h2s.append(x) + + #B column + h2=content + h2=h2.split('<div class="b-column column">', 1)[1] + h2=h2.split('<!-- close b-column -->', 1)[0] + #remove "collection" sets + while '<div class="collection headlines">' in h2: + arr=h2.split('<div class="collection headlines">', 1) + h2=arr[0]+arr[1].split('</ul>', 1)[1] + #Grab the remaining URLs + while '<a href="' in h2: + h2=h2.split('<a href="', 1)[1] + x=h2.split('"', 1)[0] + if (h1 not in x) and (x not in h2s): + h2s.append(x) + + #REMOVE DUPLICATES + removeArr=[] + for i in range(len(h2s)): + for j in range(len(h2s)): + if i==j: + continue + else: + if h2s[i] in h2s[j]: + removeArr.append(h2s[j]) + for x in removeArr: + h2s.remove(x) + + + #GET TERTIARY HEADLINES + h3=content + h3s=[] + h3=h3.split('<!-- close lede-package-region -->', 1)[1] + h3=h3.split('<a href="https://www.nytimes.com/tips">', 1)[0] + #remove "collection" sets + while '<div class="collection headlines">' in h2: + arr=h3.split('<div class="collection headlines">', 1) + h3=arr[0]+arr[1].split('</ul>', 1)[1] + + #Grab the remaining URLs + while '<a href="' in h3: + h3=h3.split('<a href="', 1)[1] + x=h3.split('"', 1)[0] + if (h1 not in x) and (x not in h3s): + h3s.append(x) + #REMOVE DUPLICATES + removeArr=[] + for i in range(len(h3s)): + for j in range(len(h3s)): + if i==j: + continue + else: + if h3s[i] in h3s[j]: + removeArr.append(h3s[j]) + for x in removeArr: + h3s.remove(x) + + + #BUILD THE ARTICLES BASED ON URLS + h1Arr=[] + h1Arr.append(buildArticle(h1, 'New York Times')) + + h2Arr=[] + for x in h2s: + h2Arr.append(buildArticle(x, 'New York Times')) + + h3Arr=[] + for x in h3s: + h3Arr.append(buildArticle(x, 'New York Times')) + + nyt=NewsSource2('New York Times', 'http://nytimes.com', h1Arr, h2Arr, h3Arr) + + return nyt + + + + +''' +EXAMPLE OF BIG HEADLINE SPANNING BOTH A AND B COLUMNS + +<div class="span-ab-layout layout"> + + <div class="ab-column column"> + + <section id="top-news" class="top-news"> + <h2 class="section-heading visually-hidden">Top News</h2> + + <div class="above-banner-region region"> + + <div class="collection"> + <div class="hpHeader" id="top-megapackage-kicker"> + <h6><a href="http://www.nytimes.com/pages/politics/index.html?src=hpHeader">The 45th President</a></h6> +</div> + +</div> + + </div><!-- close above-banner-region --> + + <div class="span-ab-top-region region"> + + <div class="collection"> + <article class="story theme-summary banner" id="topnews-100000004932040" data-story-id="100000004932040" data-rank="0" data-collection-renderstyle="Banner"> + <h1 class="story-heading"><a href="https://www.nytimes.com/2017/02/14/us/politics/fbi-interviewed-mike-flynn.html">F.B.I. Questioned Flynn About Russia Call</a></h1> +</article> +</div> + + </div><!-- close span-ab-top-region --> +''' |