Use dateutil to parse RFC 3339 datetime strings in <time> elements, if they are present.
This commit is contained in:
parent
c800312423
commit
3f4eecc238
@ -1,7 +1,11 @@
|
|||||||
from ..model import User, Category, Forum, Board, Post, Thread
|
from ..model import User, Category, Forum, Board, Post, Thread
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
from time import strptime, mktime
|
||||||
|
import dateutil.parser
|
||||||
from pyquery import PyQuery as pq
|
from pyquery import PyQuery as pq
|
||||||
|
|
||||||
|
time_format = "%b %d %y %I:%M %p"
|
||||||
|
|
||||||
def can_scrape_url (url):
|
def can_scrape_url (url):
|
||||||
return ".fr.yuku.com" in url
|
return ".fr.yuku.com" in url
|
||||||
|
|
||||||
@ -76,6 +80,11 @@ def scrape_thread (url):
|
|||||||
else:
|
else:
|
||||||
signature = None
|
signature = None
|
||||||
|
|
||||||
|
if date_element.find("time"):
|
||||||
|
timestamp = dateutil.parser.parse(date_element.text()).timestamp()
|
||||||
|
else:
|
||||||
|
timestamp = mktime(strptime(date_element.text(), time_format))
|
||||||
|
|
||||||
thread.children.append(Post(
|
thread.children.append(Post(
|
||||||
author=User(
|
author=User(
|
||||||
name=user_header.find("p > a").eq(0).text(),
|
name=user_header.find("p > a").eq(0).text(),
|
||||||
@ -84,7 +93,7 @@ def scrape_thread (url):
|
|||||||
subtitle=user_header.find(".custom_title").text(),
|
subtitle=user_header.find(".custom_title").text(),
|
||||||
signature=signature
|
signature=signature
|
||||||
),
|
),
|
||||||
timestamp=date_element.text(),
|
timestamp=timestamp,
|
||||||
body=post_content_container.html().strip()
|
body=post_content_container.html().strip()
|
||||||
))
|
))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user