Use dateutil to parse RFC 3339 datetime strings in <time> elements, if they are present.

This commit is contained in:
Adrian Malacoda 2016-11-27 01:10:04 -06:00
parent c800312423
commit 3f4eecc238

View File

@@ -1,7 +1,11 @@
from ..model import User, Category, Forum, Board, Post, Thread from ..model import User, Category, Forum, Board, Post, Thread
from urllib.parse import urlparse from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq from pyquery import PyQuery as pq
time_format = "%b %d %y %I:%M %p"
def can_scrape_url (url): def can_scrape_url (url):
return ".fr.yuku.com" in url return ".fr.yuku.com" in url
@@ -76,6 +80,11 @@ def scrape_thread (url):
else: else:
signature = None signature = None
if date_element.find("time"):
timestamp = dateutil.parser.parse(date_element.text()).timestamp()
else:
timestamp = mktime(strptime(date_element.text(), time_format))
thread.children.append(Post( thread.children.append(Post(
author=User( author=User(
name=user_header.find("p > a").eq(0).text(), name=user_header.find("p > a").eq(0).text(),
@@ -84,7 +93,7 @@ def scrape_thread (url):
subtitle=user_header.find(".custom_title").text(), subtitle=user_header.find(".custom_title").text(),
signature=signature signature=signature
), ),
timestamp=date_element.text(), timestamp=timestamp,
body=post_content_container.html().strip() body=post_content_container.html().strip()
)) ))