about | summary | refs | log | tree | commit | diff | stats
path: root/python/atoma/atom.py
diff options
context:
space:
mode:
author: James Taylor <user234683@users.noreply.github.com> 2019-09-06 15:45:01 -0700
committer: James Taylor <user234683@users.noreply.github.com> 2019-09-06 15:45:01 -0700
commit: ac32b24b2a011292b704a3f27e8fd08a7ae9424b (patch)
tree: 0d6e021519dee62089733e20880c65cdb85d8841 /python/atoma/atom.py
parent: 7a93acabb3f5a8dd95ec0d56ae57cc34eb57c1b8 (diff)
parent: c393031ac54af959561214c8b1d6b22647a81b89 (diff)
download: yt-local-ac32b24b2a011292b704a3f27e8fd08a7ae9424b.tar.lz
yt-local-ac32b24b2a011292b704a3f27e8fd08a7ae9424b.tar.xz
yt-local-ac32b24b2a011292b704a3f27e8fd08a7ae9424b.zip
Merge subscriptions into master
Diffstat (limited to 'python/atoma/atom.py')
-rw-r--r-- python/atoma/atom.py 284
1 files changed, 284 insertions, 0 deletions
diff --git a/python/atoma/atom.py b/python/atoma/atom.py
new file mode 100644
index 0000000..d4e676c
--- /dev/null
+++ b/python/atoma/atom.py
@@ -0,0 +1,284 @@
+from datetime import datetime
+import enum
+from io import BytesIO
+from typing import Optional, List
+from xml.etree.ElementTree import Element
+
+import attr
+
+from .utils import (
+ parse_xml, get_child, get_text, get_datetime, FeedParseError, ns
+)
+
+
class AtomTextType(enum.Enum):
    """Kinds of text construct, keyed by the XML ``type`` attribute value."""

    # Each member's value is the literal string looked up from the attribute.
    text = "text"
    html = "html"
    xhtml = "xhtml"
+
+
@attr.s
class AtomTextConstruct:
    """A parsed text-bearing element: its type, language and text value."""

    # _get_text_construct always passes an AtomTextType member here (never a
    # raw string), so the annotation names the enum rather than ``str``.
    text_type: AtomTextType = attr.ib()
    # Language of the text, or None when none was found.
    lang: Optional[str] = attr.ib()
    # Element text with surrounding whitespace stripped.
    value: str = attr.ib()
+
+
@attr.s
class AtomEntry:
    """One ``entry`` element of an Atom feed, as built by ``_get_entry``.

    Field order is the positional constructor order and must not change.
    """

    # NOTE(review): _get_entry fetches title and id_ without optional=False,
    # so these may be None in practice despite the annotations -- confirm
    # against the helpers in .utils.
    title: AtomTextConstruct = attr.ib()
    id_: str = attr.ib()

    # Should be mandatory but many feeds use published instead
    updated: Optional[datetime] = attr.ib()

    # People and link/category metadata; forward references are quoted
    # because those classes are declared further down the module.
    authors: List['AtomPerson'] = attr.ib()
    contributors: List['AtomPerson'] = attr.ib()
    links: List['AtomLink'] = attr.ib()
    categories: List['AtomCategory'] = attr.ib()

    # Optional elements; None when absent from the entry.
    published: Optional[datetime] = attr.ib()
    rights: Optional[AtomTextConstruct] = attr.ib()
    summary: Optional[AtomTextConstruct] = attr.ib()
    content: Optional[AtomTextConstruct] = attr.ib()

    # Feed metadata parsed from a nested ``source`` element, if any.
    source: Optional['AtomFeed'] = attr.ib()
+
+
@attr.s
class AtomFeed:
    """A parsed Atom feed document, as built by ``_parse_atom``.

    Field order is the positional constructor order and must not change.
    """

    title: Optional[AtomTextConstruct] = attr.ib()
    # The only element _parse_atom requests with optional=False.
    id_: str = attr.ib()

    # Should be mandatory but many feeds do not include it
    updated: Optional[datetime] = attr.ib()

    # Forward references are quoted because those classes are declared
    # further down the module.
    authors: List['AtomPerson'] = attr.ib()
    contributors: List['AtomPerson'] = attr.ib()
    links: List['AtomLink'] = attr.ib()
    categories: List['AtomCategory'] = attr.ib()

    # Optional feed-level metadata; None when the element is absent.
    generator: Optional['AtomGenerator'] = attr.ib()
    subtitle: Optional[AtomTextConstruct] = attr.ib()
    rights: Optional[AtomTextConstruct] = attr.ib()
    icon: Optional[str] = attr.ib()
    logo: Optional[str] = attr.ib()

    # Empty when the feed was parsed with parse_entries=False.
    entries: List[AtomEntry] = attr.ib()
+
+
@attr.s
class AtomPerson:
    """An author or contributor: a required name plus optional contacts."""

    name: str = attr.ib()
    # Both are None when the corresponding child element is missing.
    uri: Optional[str] = attr.ib()
    email: Optional[str] = attr.ib()
+
+
@attr.s
class AtomLink:
    """A ``link`` element, described entirely by its XML attributes."""

    # The only attribute _get_link reads with a mandatory lookup.
    href: str = attr.ib()

    # Optional attributes; None when not present on the element.
    rel: Optional[str] = attr.ib()
    type_: Optional[str] = attr.ib()
    hreflang: Optional[str] = attr.ib()
    title: Optional[str] = attr.ib()

    # The ``length`` attribute converted to int, or None.
    length: Optional[int] = attr.ib()
+
+
@attr.s
class AtomCategory:
    """A ``category`` element, described entirely by its XML attributes."""

    # The only attribute _get_category reads with a mandatory lookup.
    term: str = attr.ib()
    # Optional attributes; None when not present on the element.
    scheme: Optional[str] = attr.ib()
    label: Optional[str] = attr.ib()
+
+
@attr.s
class AtomGenerator:
    """The ``generator`` element: its text plus ``uri``/``version``."""

    # Stripped text content of the element.
    name: str = attr.ib()
    # Optional attributes; None when not present on the element.
    uri: Optional[str] = attr.ib()
    version: Optional[str] = attr.ib()
+
+
def _get_generator(element: Element, name,
                   optional: bool = True) -> Optional[AtomGenerator]:
    """Build an AtomGenerator from the child of *element* called *name*.

    *optional* is forwarded to ``get_child``. Returns None when no such
    child is found. The generator name is the child's stripped text, or an
    empty string when the element carries no text.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    # ElementTree reports the text of an empty element (``<generator/>``)
    # as None. Fall back to '' so the uri/version attributes are still
    # returned instead of the parse dying with AttributeError.
    return AtomGenerator(
        (child.text or '').strip(),
        child.attrib.get('uri'),
        child.attrib.get('version'),
    )
+
+
def _get_text_construct(element: Element, name,
                        optional: bool = True) -> Optional[AtomTextConstruct]:
    """Parse the child of *element* called *name* as a text construct.

    Returns None when the child is missing, or when it has no text and
    *optional* is true.

    Raises:
        FeedParseError: the child has no text and *optional* is false.
        ValueError: the ``type`` attribute is present but is not one of the
            AtomTextType values.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    # A missing "type" attribute is treated as plain text.
    try:
        text_type = AtomTextType(child.attrib['type'])
    except KeyError:
        text_type = AtomTextType.text

    # ElementTree elements have no ``lang`` attribute of their own; xml:lang
    # is stored in ``attrib`` under the expanded XML-namespace key.
    lang = child.attrib.get('{http://www.w3.org/XML/1998/namespace}lang')

    if child.text is None:
        if optional:
            return None

        raise FeedParseError(
            'Could not parse atom feed: "{}" text is required but is empty'
            .format(name)
        )

    return AtomTextConstruct(
        text_type,
        lang,
        child.text.strip()
    )
+
+
def _get_person(element: Element) -> Optional[AtomPerson]:
    """Turn an author/contributor element into an AtomPerson.

    Returns None when any lookup raises FeedParseError; the name is the
    only field requested with ``optional=False``.
    """
    try:
        person_name = get_text(element, 'feed:name', optional=False)
        person_uri = get_text(element, 'feed:uri')
        person_email = get_text(element, 'feed:email')
    except FeedParseError:
        return None

    return AtomPerson(person_name, person_uri, person_email)
+
+
def _get_link(element: Element) -> AtomLink:
    """Build an AtomLink from the attributes of a ``link`` element.

    ``href`` is required (KeyError when absent). All other attributes are
    optional. ``length`` is converted to int; a missing, empty or
    non-numeric value yields None.
    """
    attributes = element.attrib

    # The length is only an advisory hint, so a malformed value should not
    # abort parsing of the whole feed.
    raw_length = attributes.get('length')
    try:
        length = int(raw_length) if raw_length else None
    except ValueError:
        length = None

    return AtomLink(
        attributes['href'],
        attributes.get('rel'),
        attributes.get('type'),
        attributes.get('hreflang'),
        attributes.get('title'),
        length
    )
+
+
def _get_category(element: Element) -> AtomCategory:
    """Build an AtomCategory from the attributes of a ``category`` element.

    ``term`` is required (KeyError when absent); ``scheme`` and ``label``
    default to None.
    """
    attributes = element.attrib
    term = attributes['term']
    scheme = attributes.get('scheme')
    label = attributes.get('label')
    return AtomCategory(term, scheme, label)
+
+
def _get_entry(element: Element,
               default_authors: List[AtomPerson]) -> AtomEntry:
    """Parse one ``entry`` element into an AtomEntry.

    *default_authors* are the authors of the enclosing feed. They are used
    only when neither the entry nor its ``source`` element supplies a
    usable author.
    """
    root = element

    # Nominally mandatory, but fetched without optional=False so that
    # incomplete entries still parse.
    title = _get_text_construct(root, 'feed:title')
    id_ = get_text(root, 'feed:id')

    # Optional: feed metadata copied from the entry's originating feed.
    # Any FeedParseError (missing or unparsable source) means "no source".
    try:
        source = _parse_atom(get_child(root, 'feed:source', optional=False),
                             parse_entries=False)
    except FeedParseError:
        source = None
        source_authors = []
    else:
        source_authors = source.authors

    # Author precedence follows RFC 4287 section 4.2.1: the entry's own
    # authors, then those of its source, then those of the enclosing feed.
    parsed_authors = (_get_person(e)
                      for e in root.findall('feed:author', ns))
    authors = [a for a in parsed_authors if a is not None]
    if not authors:
        # Copy so that entries never share one mutable list object.
        authors = list(source_authors or default_authors)

    # _get_person returns None for elements without a name, so no truth
    # test on the Element itself (deprecated since Python 3.12) is needed.
    parsed_contributors = (_get_person(e)
                           for e in root.findall('feed:contributor', ns))
    contributors = [c for c in parsed_contributors if c is not None]

    links = [_get_link(e) for e in root.findall('feed:link', ns)]
    categories = [_get_category(e) for e in root.findall('feed:category', ns)]

    updated = get_datetime(root, 'feed:updated')
    published = get_datetime(root, 'feed:published')
    rights = _get_text_construct(root, 'feed:rights')
    summary = _get_text_construct(root, 'feed:summary')
    content = _get_text_construct(root, 'feed:content')

    return AtomEntry(
        title,
        id_,
        updated,
        authors,
        contributors,
        links,
        categories,
        published,
        rights,
        summary,
        content,
        source
    )
+
+
def _parse_atom(root: Element, parse_entries: bool = True) -> AtomFeed:
    """Parse a ``feed``-shaped element into an AtomFeed.

    *root* is either a document root or a nested ``source`` element. With
    ``parse_entries=False`` the ``entry`` children are skipped and
    ``entries`` is an empty list.

    The id is the only element requested with ``optional=False``.
    """
    # Mandatory
    id_ = get_text(root, 'feed:id', optional=False)

    # Optional
    title = _get_text_construct(root, 'feed:title')
    updated = get_datetime(root, 'feed:updated')

    # _get_person returns None for elements without a name, so filtering on
    # its result is enough; truth-testing the Element itself is deprecated
    # since Python 3.12.
    parsed_authors = (_get_person(e)
                      for e in root.findall('feed:author', ns))
    authors = [a for a in parsed_authors if a is not None]
    parsed_contributors = (_get_person(e)
                           for e in root.findall('feed:contributor', ns))
    contributors = [c for c in parsed_contributors if c is not None]

    links = [_get_link(e)
             for e in root.findall('feed:link', ns)]
    categories = [_get_category(e)
                  for e in root.findall('feed:category', ns)]

    generator = _get_generator(root, 'feed:generator')
    subtitle = _get_text_construct(root, 'feed:subtitle')
    rights = _get_text_construct(root, 'feed:rights')
    icon = get_text(root, 'feed:icon')
    logo = get_text(root, 'feed:logo')

    # The feed-level authors are handed to each entry as its fallback.
    if parse_entries:
        entries = [_get_entry(e, authors)
                   for e in root.findall('feed:entry', ns)]
    else:
        entries = []

    return AtomFeed(
        title,
        id_,
        updated,
        authors,
        contributors,
        links,
        categories,
        generator,
        subtitle,
        rights,
        icon,
        logo,
        entries
    )
+
+
def parse_atom_file(filename: str) -> AtomFeed:
    """Read the XML file at *filename* and return it as an AtomFeed."""
    tree = parse_xml(filename)
    return _parse_atom(tree.getroot())
+
+
def parse_atom_bytes(data: bytes) -> AtomFeed:
    """Return the AtomFeed described by the XML held in *data*."""
    # parse_xml is given a file-like object, so wrap the bytes in a buffer.
    buffer = BytesIO(data)
    tree = parse_xml(buffer)
    return _parse_atom(tree.getroot())