From 924e6e0ece7ef9e85cfe761c5383a54000dad2f7 Mon Sep 17 00:00:00 2001
From: Matt Singleton <matt@xcolour.net>
Date: Sun, 3 Sep 2017 14:10:31 -0400
Subject: rewrite fox parser to use beautifulsoup

---
 unbiased/parser.py | 50 +++++++++++++++++++++-----------------------------
 1 file changed, 21 insertions(+), 29 deletions(-)
diff --git a/unbiased/parser.py b/unbiased/parser.py
index 7d2d788..e3344f4 100755
--- a/unbiased/parser.py
+++ b/unbiased/parser.py
@@ -5,6 +5,7 @@ import os
 import re
 import urllib.parse
 
+from bs4 import BeautifulSoup
 import requests
 
 from unbiased.unbiasedObjects import *
@@ -818,41 +819,32 @@ def buildABC():
 
 
 def buildFoxNews():
-    url='http://foxnews.com'
-    name='Fox News'
-
-    #DOWNLOAD HOMEPAGE CONTENT
-    content=urlToContent(url)
-    
-    #get main headline
-    h1=content
-    h1=h1.split('<h1><a href="', 1)[1]
-    h1=h1.split('"', 1)[0]
-    h1s=[h1]
+    url = 'http://foxnews.com'
+    name = 'Fox News'
+
+    # DOWNLOAD HOMEPAGE CONTENT
+    content = urlToContent(url)
+    soup = BeautifulSoup(content, 'lxml')
+
+    # get main headline
+    h1 = soup.find('div', id='big-top')\
+             .find('div', class_='primary')\
+             .find('h1')\
+             .find('a')
+    h1 = h1['href']
+    h1s = [h1]
     h1s = ['http:' + x if x.startswith('//') else x for x in h1s]
 
     #GET SECONDARY HEADLINES
-    h2=content
-    h2s=[]
-    h2=h2.split('<div class="top-stories">', 1)[1]
-    h2=h2.split('<section id="latest"', 1)[0]
-    while '<li data-vr-contentbox=""><a href="' in h2:
-        h2=h2.split('<li data-vr-contentbox=""><a href="', 1)[1]
-        x=h2.split('"', 1)[0]
-        if h1 not in x:
-            h2s.append(x)
+    h2s = soup.find('div', id='big-top').find('div', class_='top-stories').select('li > a')
+    h2s = [x['href'] for x in h2s]
     h2s = ['http:' + x if x.startswith('//') else x for x in h2s]
 
     #GET TERTIARY HEADLINES
-    h3=content
-    h3s=[]
-    h3=h3.split('div id="big-top"', 1)[1]
-    h3=h3.split('<div class="top-stories">', 1)[0]
-    while '<a href="' in h3:
-        h3=h3.split('<a href="', 1)[1]
-        x=h3.split('"', 1)[0]
-        if h1 not in x:
-            h3s.append(x)
+    h3s = []
+    for ul in soup.find('section', id='latest').find_all('ul', recursive=False):
+        for li in ul.find_all('li', recursive=False):
+            h3s.append(li.find('a')['href'])
     h3s = ['http:' + x if x.startswith('//') else x for x in h3s]
 
     h1s, h2s, h3s = removeDuplicates(h1s, h2s, h3s)
-- 
cgit v1.2.3