diff options
Diffstat (limited to 'python/atoma/rss.py')
-rw-r--r-- | python/atoma/rss.py | 221 |
1 files changed, 0 insertions, 221 deletions
# Source: python/atoma/rss.py (blob f447a2f), recovered from the diff that
# deleted the file.
"""Data classes for RSS 2.0 feeds and the functions that populate them."""
from datetime import datetime
from io import BytesIO
from typing import Optional, List
from xml.etree.ElementTree import Element

import attr

from .utils import (
    parse_xml, get_child, get_text, get_int, get_datetime, FeedParseError
)


@attr.s
class RSSImage:
    """Contents of a channel's ``<image>`` element."""

    url: str = attr.ib()
    title: Optional[str] = attr.ib()
    link: str = attr.ib()
    # width/height fall back to 88 and 31 when the feed gives no usable value.
    width: int = attr.ib()
    height: int = attr.ib()
    description: Optional[str] = attr.ib()


@attr.s
class RSSEnclosure:
    """Attributes of an item's ``<enclosure>`` element."""

    url: str = attr.ib()
    # None when the ``length`` attribute is absent or not an integer.
    length: Optional[int] = attr.ib()
    type: Optional[str] = attr.ib()


@attr.s
class RSSSource:
    """An item's ``<source>`` element: its text and ``url`` attribute."""

    title: str = attr.ib()
    url: Optional[str] = attr.ib()


@attr.s
class RSSItem:
    """A single ``<item>`` of an RSS channel."""

    title: Optional[str] = attr.ib()
    # May have been taken from the GUID, see ``_get_link``.
    link: Optional[str] = attr.ib()
    description: Optional[str] = attr.ib()
    author: Optional[str] = attr.ib()
    categories: List[str] = attr.ib()
    comments: Optional[str] = attr.ib()
    enclosures: List[RSSEnclosure] = attr.ib()
    guid: Optional[str] = attr.ib()
    pub_date: Optional[datetime] = attr.ib()
    source: Optional[RSSSource] = attr.ib()

    # Extension
    content_encoded: Optional[str] = attr.ib()


@attr.s
class RSSChannel:
    """The ``<channel>`` of an RSS feed together with all of its items."""

    title: Optional[str] = attr.ib()
    link: Optional[str] = attr.ib()
    description: Optional[str] = attr.ib()
    language: Optional[str] = attr.ib()
    copyright: Optional[str] = attr.ib()
    managing_editor: Optional[str] = attr.ib()
    web_master: Optional[str] = attr.ib()
    pub_date: Optional[datetime] = attr.ib()
    last_build_date: Optional[datetime] = attr.ib()
    categories: List[str] = attr.ib()
    generator: Optional[str] = attr.ib()
    docs: Optional[str] = attr.ib()
    ttl: Optional[int] = attr.ib()
    image: Optional[RSSImage] = attr.ib()

    items: List[RSSItem] = attr.ib()

    # Extension
    content_encoded: Optional[str] = attr.ib()


def _get_image(element: Element, name,
               optional: bool = True) -> Optional[RSSImage]:
    """Build an ``RSSImage`` from the child of *element* called *name*.

    Returns ``None`` when ``get_child`` finds no such child. ``url`` and
    ``link`` are requested with ``optional=False``.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    return RSSImage(
        url=get_text(child, 'url', optional=False),
        title=get_text(child, 'title'),
        link=get_text(child, 'link', optional=False),
        # ``or`` also replaces an explicit 0 with the default size.
        width=get_int(child, 'width') or 88,
        height=get_int(child, 'height') or 31,
        description=get_text(child, 'description'),
    )


def _get_source(element: Element, name,
                optional: bool = True) -> Optional[RSSSource]:
    """Build an ``RSSSource`` from the child of *element* called *name*.

    Returns ``None`` when ``get_child`` finds no such child.
    """
    child = get_child(element, name, optional)
    if child is None:
        return None

    return RSSSource(
        # Element.text is None for an element without text (for example
        # <source url="..."/>); use an empty title instead of crashing.
        title=(child.text or '').strip(),
        url=child.attrib.get('url'),
    )


def _get_enclosure(element: Element) -> RSSEnclosure:
    """Build an ``RSSEnclosure`` from the attributes of *element*.

    Raises:
        KeyError: if *element* has no ``url`` attribute.
    """
    raw_length = element.attrib.get('length')
    try:
        length = int(raw_length)
    except (TypeError, ValueError):
        # TypeError: attribute missing (None); ValueError: not an integer.
        length = None

    return RSSEnclosure(
        url=element.attrib['url'],
        length=length,
        type=element.attrib.get('type'),
    )


def _get_link(element: Element) -> Optional[str]:
    """Attempt to retrieve item link.

    Use the GUID as a fallback if it is a permalink.
    """
    link = get_text(element, 'link')
    if link is not None:
        return link

    guid = get_child(element, 'guid')
    # Only an explicit isPermaLink="true" makes the GUID usable as a link.
    if guid is not None and guid.attrib.get('isPermaLink') == 'true':
        return get_text(element, 'guid')

    return None


def _get_item(element: Element) -> RSSItem:
    """Build an ``RSSItem`` from an ``<item>`` element."""
    return RSSItem(
        title=get_text(element, 'title'),
        link=_get_link(element),
        description=get_text(element, 'description'),
        author=get_text(element, 'author'),
        # Raw element text: an empty <category/> contributes None.
        categories=[e.text for e in element.findall('category')],
        comments=get_text(element, 'comments'),
        enclosures=[_get_enclosure(e) for e in element.findall('enclosure')],
        guid=get_text(element, 'guid'),
        pub_date=get_datetime(element, 'pubDate'),
        source=_get_source(element, 'source'),
        content_encoded=get_text(element, 'content:encoded'),
    )


def _parse_rss(root: Element) -> RSSChannel:
    """Build an ``RSSChannel`` from the root element of an RSS document.

    Raises:
        FeedParseError: if the root's ``version`` attribute is not ``"2.0"``
            or the root has no ``<channel>`` child.
    """
    rss_version = root.get('version')
    if rss_version != '2.0':
        raise FeedParseError('Cannot process RSS feed version "{}"'
                             .format(rss_version))

    channel = root.find('channel')
    if channel is None:
        # Report a malformed feed the same way as an unsupported version
        # instead of failing later on a None element.
        raise FeedParseError('RSS feed does not contain a "channel" element')

    return RSSChannel(
        title=get_text(channel, 'title'),
        link=get_text(channel, 'link'),
        description=get_text(channel, 'description'),
        language=get_text(channel, 'language'),
        copyright=get_text(channel, 'copyright'),
        managing_editor=get_text(channel, 'managingEditor'),
        web_master=get_text(channel, 'webMaster'),
        pub_date=get_datetime(channel, 'pubDate'),
        last_build_date=get_datetime(channel, 'lastBuildDate'),
        # Raw element text: an empty <category/> contributes None.
        categories=[e.text for e in channel.findall('category')],
        generator=get_text(channel, 'generator'),
        docs=get_text(channel, 'docs'),
        ttl=get_int(channel, 'ttl'),
        image=_get_image(channel, 'image'),
        items=[_get_item(e) for e in channel.findall('item')],
        content_encoded=get_text(channel, 'content:encoded'),
    )


def parse_rss_file(filename: str) -> RSSChannel:
    """Parse an RSS feed from a local XML file."""
    root = parse_xml(filename).getroot()
    return _parse_rss(root)


def parse_rss_bytes(data: bytes) -> RSSChannel:
    """Parse an RSS feed from a byte-string containing XML data."""
    root = parse_xml(BytesIO(data)).getroot()
    return _parse_rss(root)