Use dateutil to parse RFC 3339 datetime strings in <time> elements, if they are present.

This commit is contained in:
Adrian Malacoda 2016-11-27 01:10:04 -06:00
parent c800312423
commit 3f4eecc238

View File

@ -1,7 +1,11 @@
from ..model import User, Category, Forum, Board, Post, Thread
from urllib.parse import urlparse
from time import strptime, mktime
import dateutil.parser
from pyquery import PyQuery as pq
time_format = "%b %d %y %I:%M %p"
def can_scrape_url(url):
    """Return True if *url* points at a host under ``.fr.yuku.com``.

    The decision is made on the URL's hostname rather than on the raw
    string, so the marker appearing in a path, query string, or as part of
    a longer foreign domain does not count as a match.

    Args:
        url: URL string, with or without a scheme
            (e.g. ``"http://forum.fr.yuku.com/topic/1"`` or
            ``"forum.fr.yuku.com/topic/1"``).

    Returns:
        bool: True when the hostname ends with ``.fr.yuku.com``.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # urlparse only recognises a host after a "//" authority marker;
            # add one so scheme-less input keeps being accepted.
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets): not scrapeable.
        return False
    # A fully-qualified hostname may carry a trailing dot; ignore it.
    return host.rstrip(".").endswith(".fr.yuku.com")
@ -76,6 +80,11 @@ def scrape_thread (url):
else:
signature = None
if date_element.find("time"):
timestamp = dateutil.parser.parse(date_element.text()).timestamp()
else:
timestamp = mktime(strptime(date_element.text(), time_format))
thread.children.append(Post(
author=User(
name=user_header.find("p > a").eq(0).text(),
@ -84,7 +93,7 @@ def scrape_thread (url):
subtitle=user_header.find(".custom_title").text(),
signature=signature
),
timestamp=date_element.text(),
timestamp=timestamp,
body=post_content_container.html().strip()
))