# python/atoma/utils.py

from datetime import datetime, timezone
from xml.etree.ElementTree import Element
from typing import Optional

import dateutil.parser
from defusedxml.ElementTree import parse as defused_xml_parse, ParseError

from .exceptions import FeedXMLError, FeedParseError

# Prefix -> namespace URI table handed to ``Element.find`` (see ``get_child``)
# so that lookup paths can use the short ``prefix:tag`` form.
ns = {
    # RSS 1.0 "content" module.
    'content': 'http://purl.org/rss/1.0/modules/content/',
    # Atom syntax namespace.
    'feed': 'http://www.w3.org/2005/Atom',
}


def parse_xml(xml_content):
    """Parse ``xml_content`` with defusedxml and return the parse result.

    ``xml_content`` is forwarded unchanged to
    ``defusedxml.ElementTree.parse``.

    Raises:
        FeedXMLError: if the parser raises ``ParseError``.  The parser's
            exception is attached as ``__cause__`` so its diagnostics are
            not lost.
    """
    try:
        return defused_xml_parse(xml_content)
    except ParseError as exc:
        # Chain explicitly: the traceback then reports the parser error as
        # the direct cause instead of an unrelated error "during handling".
        raise FeedXMLError('Not a valid XML document') from exc


def get_child(element: Element, name,
              optional: bool=True) -> Optional[Element]:
    """Return the first sub-element of ``element`` matching ``name``.

    ``name`` is passed to ``Element.find`` together with the module-level
    ``ns`` prefix map.

    Returns:
        The matching element, or ``None`` when nothing matched and
        ``optional`` is true.

    Raises:
        FeedParseError: when nothing matched and ``optional`` is false.
    """
    found = element.find(name, namespaces=ns)

    # Compare against None explicitly: an Element without children is falsy.
    if found is not None:
        return found

    if optional:
        return None

    raise FeedParseError(
        'Could not parse feed: "{}" does not have a "{}"'
        .format(element.tag, name)
    )


def get_text(element: Element, name, optional: bool=True) -> Optional[str]:
    """Return the whitespace-stripped text of the ``name`` child.

    The child is located with ``get_child``, which receives ``optional``
    unchanged and therefore raises for a missing required child.

    Returns:
        The stripped text, or ``None`` when ``optional`` is true and the
        child is missing or has no text.

    Raises:
        FeedParseError: when ``optional`` is false and the child exists but
            its ``text`` is ``None``.
    """
    node = get_child(element, name, optional)
    if node is None:
        return None

    raw = node.text
    if raw is not None:
        return raw.strip()

    # The child exists but carries no text at all.
    if optional:
        return None

    raise FeedParseError(
        'Could not parse feed: "{}" text is required but is empty'
        .format(name)
    )


def get_int(element: Element, name, optional: bool=True) -> Optional[int]:
    """Return the text of the ``name`` child converted with ``int``.

    The text is fetched with ``get_text`` (same ``optional`` handling);
    ``None`` is passed through untouched.  Text that ``int`` cannot convert
    raises ``ValueError``, which is not translated here.
    """
    raw = get_text(element, name, optional)
    return None if raw is None else int(raw)


def get_datetime(element: Element, name,
                 optional: bool=True) -> Optional[datetime]:
    """Return the text of the ``name`` child parsed as a datetime.

    The text is fetched with ``get_text`` (same ``optional`` handling) and
    handed to ``try_parse_date``.  ``None`` is returned when there is no
    text; ``try_parse_date`` may also return ``None`` for text it cannot
    parse, regardless of ``optional``.
    """
    raw = get_text(element, name, optional)
    return try_parse_date(raw) if raw is not None else None


def try_parse_date(date_str: str) -> Optional[datetime]:
    """Parse ``date_str`` with dateutil in fuzzy mode.

    Returns:
        A timezone-aware datetime, or ``None`` when the parser raises
        ``ValueError`` or ``OverflowError``.  A result without timezone
        information is tagged as UTC.
    """
    try:
        parsed = dateutil.parser.parse(date_str, fuzzy=True)
    except (ValueError, OverflowError):
        return None

    if parsed.tzinfo is not None:
        return parsed

    # Naive result: assume the value is already expressed in UTC and attach
    # the zone without shifting the clock time.
    return parsed.replace(tzinfo=timezone.utc)