Use dateutil to parse RFC 3339 datetime strings in <time> elements, if they are present.
This commit is contained in:
parent
c800312423
commit
3f4eecc238
@ -1,7 +1,11 @@
|
||||
from ..model import User, Category, Forum, Board, Post, Thread
|
||||
from urllib.parse import urlparse
|
||||
from time import strptime, mktime
|
||||
import dateutil.parser
|
||||
from pyquery import PyQuery as pq
|
||||
|
||||
# strptime() format for post dates given as plain text: abbreviated month
# name, day of month, two-digit year, then 12-hour clock time with AM/PM.
time_format = "%b %d %y %I:%M %p"
|
||||
|
||||
def can_scrape_url(url):
    """Return True if *url* is hosted on a subdomain of ``fr.yuku.com``.

    Only the host part of the URL is examined, so the marker appearing in a
    path, a query string, or as a prefix of some other domain
    (``x.fr.yuku.com.evil.example``) does not count as a match.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        bool: True when the hostname ends with ``.fr.yuku.com``.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # Scheme-less input ("board.fr.yuku.com/topic/1") is parsed as a
            # bare path; prefix "//" so the leading part is read as the host.
            parsed = urlparse("//" + url)
        hostname = parsed.hostname or ""
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced IPv6
        # brackets); such a URL cannot be one of ours.
        return False

    # Tolerate a fully-qualified trailing dot ("board.fr.yuku.com.").
    return hostname.rstrip(".").endswith(".fr.yuku.com")
|
||||
|
||||
@ -76,6 +80,11 @@ def scrape_thread (url):
|
||||
else:
|
||||
signature = None
|
||||
|
||||
if date_element.find("time"):
|
||||
timestamp = dateutil.parser.parse(date_element.text()).timestamp()
|
||||
else:
|
||||
timestamp = mktime(strptime(date_element.text(), time_format))
|
||||
|
||||
thread.children.append(Post(
|
||||
author=User(
|
||||
name=user_header.find("p > a").eq(0).text(),
|
||||
@ -84,7 +93,7 @@ def scrape_thread (url):
|
||||
subtitle=user_header.find(".custom_title").text(),
|
||||
signature=signature
|
||||
),
|
||||
timestamp=date_element.text(),
|
||||
timestamp=timestamp,
|
||||
body=post_content_container.html().strip()
|
||||
))
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user