diff options
author | James Taylor <user234683@users.noreply.github.com> | 2019-09-06 15:45:01 -0700 |
---|---|---|
committer | James Taylor <user234683@users.noreply.github.com> | 2019-09-06 15:45:01 -0700 |
commit | ac32b24b2a011292b704a3f27e8fd08a7ae9424b (patch) | |
tree | 0d6e021519dee62089733e20880c65cdb85d8841 /python/atoma/atom.py | |
parent | 7a93acabb3f5a8dd95ec0d56ae57cc34eb57c1b8 (diff) | |
parent | c393031ac54af959561214c8b1d6b22647a81b89 (diff) | |
download | yt-local-ac32b24b2a011292b704a3f27e8fd08a7ae9424b.tar.lz yt-local-ac32b24b2a011292b704a3f27e8fd08a7ae9424b.tar.xz yt-local-ac32b24b2a011292b704a3f27e8fd08a7ae9424b.zip |
Merge subscriptions into master
Diffstat (limited to 'python/atoma/atom.py')
-rw-r--r-- | python/atoma/atom.py | 284 |
1 files changed, 284 insertions, 0 deletions
# python/atoma/atom.py
"""Atom feed parsing.

Defines ``attrs``-based containers for the pieces of an Atom document
(feed, entry, person, link, category, generator, text construct) and the
functions that build them from an ElementTree tree.  The public entry
points are :func:`parse_atom_file` and :func:`parse_atom_bytes`.
"""
from datetime import datetime
import enum
from io import BytesIO
from typing import Optional, List
from xml.etree.ElementTree import Element

import attr

from .utils import (
    parse_xml, get_child, get_text, get_datetime, FeedParseError, ns
)

# Key under which ElementTree stores the ``xml:lang`` attribute of an element.
_XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'


class AtomTextType(enum.Enum):
    """Values accepted for the ``type`` attribute of a text construct."""

    text = "text"
    html = "html"
    xhtml = "xhtml"


@attr.s
class AtomTextConstruct:
    """Text content together with its declared type and language."""

    text_type: AtomTextType = attr.ib()
    lang: Optional[str] = attr.ib()
    value: str = attr.ib()


@attr.s
class AtomEntry:
    """A single ``entry`` element of a feed."""

    title: AtomTextConstruct = attr.ib()
    id_: str = attr.ib()

    # Should be mandatory but many feeds use published instead
    updated: Optional[datetime] = attr.ib()

    authors: List['AtomPerson'] = attr.ib()
    contributors: List['AtomPerson'] = attr.ib()
    links: List['AtomLink'] = attr.ib()
    categories: List['AtomCategory'] = attr.ib()
    published: Optional[datetime] = attr.ib()
    rights: Optional[AtomTextConstruct] = attr.ib()
    summary: Optional[AtomTextConstruct] = attr.ib()
    content: Optional[AtomTextConstruct] = attr.ib()
    source: Optional['AtomFeed'] = attr.ib()


@attr.s
class AtomFeed:
    """A ``feed`` element (also used for an entry's ``source`` element)."""

    title: Optional[AtomTextConstruct] = attr.ib()
    id_: str = attr.ib()

    # Should be mandatory but many feeds do not include it
    updated: Optional[datetime] = attr.ib()

    authors: List['AtomPerson'] = attr.ib()
    contributors: List['AtomPerson'] = attr.ib()
    links: List['AtomLink'] = attr.ib()
    categories: List['AtomCategory'] = attr.ib()
    generator: Optional['AtomGenerator'] = attr.ib()
    subtitle: Optional[AtomTextConstruct] = attr.ib()
    rights: Optional[AtomTextConstruct] = attr.ib()
    icon: Optional[str] = attr.ib()
    logo: Optional[str] = attr.ib()

    entries: List[AtomEntry] = attr.ib()


@attr.s
class AtomPerson:
    """An ``author`` or ``contributor`` element."""

    name: str = attr.ib()
    uri: Optional[str] = attr.ib()
    email: Optional[str] = attr.ib()


@attr.s
class AtomLink:
    """A ``link`` element; only ``href`` is required."""

    href: str = attr.ib()
    rel: Optional[str] = attr.ib()
    type_: Optional[str] = attr.ib()
    hreflang: Optional[str] = attr.ib()
    title: Optional[str] = attr.ib()
    length: Optional[int] = attr.ib()


@attr.s
class AtomCategory:
    """A ``category`` element; only ``term`` is required."""

    term: str = attr.ib()
    scheme: Optional[str] = attr.ib()
    label: Optional[str] = attr.ib()


@attr.s
class AtomGenerator:
    """A ``generator`` element: its text plus optional uri and version."""

    name: str = attr.ib()
    uri: Optional[str] = attr.ib()
    version: Optional[str] = attr.ib()


def _get_generator(element: Element, name,
                   optional: bool = True) -> Optional[AtomGenerator]:
    """Build an AtomGenerator from the child ``name`` of ``element``.

    Returns None when ``get_child`` yields no child.  ``optional`` is passed
    straight through to ``get_child``.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    # An element without text (e.g. <generator uri="..."/>) has text=None;
    # treat that as an empty name instead of failing on None.strip().
    return AtomGenerator(
        (child.text or '').strip(),
        child.attrib.get('uri'),
        child.attrib.get('version'),
    )


def _get_text_construct(element: Element, name,
                        optional: bool = True) -> Optional[AtomTextConstruct]:
    """Build an AtomTextConstruct from the child ``name`` of ``element``.

    Returns None when the child is absent, or when it has no text and
    ``optional`` is true.

    Raises:
        FeedParseError: the child has no text and ``optional`` is false.
        ValueError: the ``type`` attribute is not an AtomTextType value.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    # A missing type attribute defaults to plain text.
    # NOTE(review): an unrecognised type value (e.g. a MIME type) still
    # propagates as ValueError from the enum lookup -- confirm that is wanted.
    try:
        text_type = AtomTextType(child.attrib['type'])
    except KeyError:
        text_type = AtomTextType.text

    # ElementTree elements have no ``lang`` attribute; xml:lang lives in
    # ``attrib`` under the XML-namespace-qualified key.
    lang = child.attrib.get(_XML_LANG)

    if child.text is None:
        if optional:
            return None

        raise FeedParseError(
            'Could not parse atom feed: "{}" text is required but is empty'
            .format(name)
        )

    return AtomTextConstruct(
        text_type,
        lang,
        child.text.strip()
    )


def _get_person(element: Element) -> Optional[AtomPerson]:
    """Build an AtomPerson, or return None if the lookup raises FeedParseError.

    The name is requested with ``optional=False``; presumably ``get_text``
    raises FeedParseError when it is missing -- verify against utils.
    """
    try:
        return AtomPerson(
            get_text(element, 'feed:name', optional=False),
            get_text(element, 'feed:uri'),
            get_text(element, 'feed:email')
        )
    except FeedParseError:
        return None


def _get_people(root: Element, name: str) -> List[AtomPerson]:
    """Return the usable persons among the ``name`` children of ``root``."""
    # No truthiness filter on the elements: Element truth testing means
    # "has children" and is deprecated; _get_person already rejects
    # elements it cannot use by returning None.
    people = (_get_person(e) for e in root.findall(name, ns))
    return [person for person in people if person is not None]


def _get_link(element: Element) -> AtomLink:
    """Build an AtomLink from the attributes of a ``link`` element.

    Raises:
        KeyError: the ``href`` attribute is missing.
        ValueError: ``length`` is present but not an integer.
    """
    length = element.attrib.get('length')
    length = int(length) if length else None
    return AtomLink(
        element.attrib['href'],
        element.attrib.get('rel'),
        element.attrib.get('type'),
        element.attrib.get('hreflang'),
        element.attrib.get('title'),
        length
    )


def _get_category(element: Element) -> AtomCategory:
    """Build an AtomCategory from the attributes of a ``category`` element.

    Raises:
        KeyError: the ``term`` attribute is missing.
    """
    return AtomCategory(
        element.attrib['term'],
        element.attrib.get('scheme'),
        element.attrib.get('label'),
    )


def _get_entry(element: Element,
               default_authors: List[AtomPerson]) -> AtomEntry:
    """Build an AtomEntry from an ``entry`` element.

    ``default_authors`` (the feed-level authors) are used when the entry
    itself has no usable author; the authors of the entry's ``source``
    element are the last fallback.
    """
    root = element

    # Mandatory per the Atom format, but looked up leniently (both calls use
    # their default ``optional`` argument).
    title = _get_text_construct(root, 'feed:title')
    id_ = get_text(root, 'feed:id')

    # Optional
    try:
        source = _parse_atom(get_child(root, 'feed:source', optional=False),
                             parse_entries=False)
    except FeedParseError:
        source = None
        source_authors = []
    else:
        source_authors = source.authors

    # NOTE(review): RFC 4287 section 4.2.1 appears to give atom:source
    # authors priority over feed authors; the existing precedence is kept
    # here -- confirm before changing.
    # list() so the entry never shares a list object with the feed/source.
    authors = list(
        _get_people(root, 'feed:author') or default_authors or source_authors
    )
    contributors = _get_people(root, 'feed:contributor')

    links = [_get_link(e) for e in root.findall('feed:link', ns)]
    categories = [_get_category(e) for e in root.findall('feed:category', ns)]

    updated = get_datetime(root, 'feed:updated')
    published = get_datetime(root, 'feed:published')
    rights = _get_text_construct(root, 'feed:rights')
    summary = _get_text_construct(root, 'feed:summary')
    content = _get_text_construct(root, 'feed:content')

    return AtomEntry(
        title,
        id_,
        updated,
        authors,
        contributors,
        links,
        categories,
        published,
        rights,
        summary,
        content,
        source
    )


def _parse_atom(root: Element, parse_entries: bool = True) -> AtomFeed:
    """Build an AtomFeed from a ``feed`` (or ``source``) element.

    With ``parse_entries`` false the ``entry`` children are skipped and
    ``entries`` is an empty list.  The ``id`` lookup uses ``optional=False``,
    so a missing id is presumably reported as FeedParseError by ``get_text``.
    """
    # Mandatory
    id_ = get_text(root, 'feed:id', optional=False)

    # Optional
    title = _get_text_construct(root, 'feed:title')
    updated = get_datetime(root, 'feed:updated')
    authors = _get_people(root, 'feed:author')
    contributors = _get_people(root, 'feed:contributor')
    links = [_get_link(e)
             for e in root.findall('feed:link', ns)]
    categories = [_get_category(e)
                  for e in root.findall('feed:category', ns)]

    generator = _get_generator(root, 'feed:generator')
    subtitle = _get_text_construct(root, 'feed:subtitle')
    rights = _get_text_construct(root, 'feed:rights')
    icon = get_text(root, 'feed:icon')
    logo = get_text(root, 'feed:logo')

    if parse_entries:
        entries = [_get_entry(e, authors)
                   for e in root.findall('feed:entry', ns)]
    else:
        entries = []

    return AtomFeed(
        title,
        id_,
        updated,
        authors,
        contributors,
        links,
        categories,
        generator,
        subtitle,
        rights,
        icon,
        logo,
        entries
    )


def parse_atom_file(filename: str) -> AtomFeed:
    """Parse an Atom feed from a local XML file."""
    root = parse_xml(filename).getroot()
    return _parse_atom(root)


def parse_atom_bytes(data: bytes) -> AtomFeed:
    """Parse an Atom feed from a byte-string containing XML data."""
    root = parse_xml(BytesIO(data)).getroot()
    return _parse_atom(root)