aboutsummaryrefslogtreecommitdiffstats
path: root/python/atoma/simple.py
blob: 98bb3e1da716046cfa3da315a39b09a107325a0e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Simple API that abstracts away the differences between feed types."""

from datetime import datetime, timedelta
import html
import os
from typing import Optional, List, Tuple
import urllib.parse

import attr

from . import atom, rss, json_feed
from .exceptions import (
    FeedParseError, FeedDocumentError, FeedXMLError, FeedJSONError
)


@attr.s
class Attachment:
    """A file attached to an article (e.g. a podcast enclosure).

    Declaration order matters: instances are built positionally in
    `_adapt_json_feed`, so fields must not be reordered.
    """
    link: str = attr.ib()  # URL of the attached resource
    mime_type: Optional[str] = attr.ib()  # e.g. "audio/mpeg"; None when the source omits it
    title: Optional[str] = attr.ib()  # explicit title, or one derived from the link's file name
    size_in_bytes: Optional[int] = attr.ib()  # declared length; not verified against the resource
    duration: Optional[timedelta] = attr.ib()  # only JSON Feed provides this; None for Atom/RSS


@attr.s
class Article:
    """A single feed entry/item, normalized across Atom, RSS and JSON Feed.

    Declaration order matters: instances are built positionally by the
    adapter functions below, so fields must not be reordered.
    """
    id: str = attr.ib()  # entry id / guid; RSS falls back to the item link
    title: Optional[str] = attr.ib()  # plain-text title (HTML titles are unescaped by the adapters)
    link: Optional[str] = attr.ib()  # permalink to the article, when available
    content: str = attr.ib()  # best available body (full content, else summary/description, else '')
    published_at: Optional[datetime] = attr.ib()  # publication date; may carry the update date when only one exists
    updated_at: Optional[datetime] = attr.ib()  # last-modified date; always None for RSS items
    attachments: List[Attachment] = attr.ib()  # enclosures/attachments, possibly empty


@attr.s
class Feed:
    """A whole feed, normalized across Atom, RSS and JSON Feed.

    Declaration order matters: instances are built positionally by the
    adapter functions below, so fields must not be reordered.
    """
    title: str = attr.ib()  # feed title; adapters fall back to the feed id or link when missing
    subtitle: Optional[str] = attr.ib()  # subtitle/description of the feed
    link: Optional[str] = attr.ib()  # feed URL (rel="self" for Atom, channel link for RSS, feed_url for JSON)
    updated_at: Optional[datetime] = attr.ib()  # feed-level update date; None for JSON Feed
    articles: List[Article] = attr.ib()  # all entries, in source order


def _adapt_atom_feed(atom_feed: atom.AtomFeed) -> Feed:
    """Convert a parsed Atom feed into the simplified `Feed` model."""
    articles = []
    for entry in atom_feed.entries:
        # Prefer the full content, fall back to the summary, then empty.
        if entry.content is not None:
            body = entry.content.value
        elif entry.summary is not None:
            body = entry.summary.value
        else:
            body = ''

        published, updated = _get_article_dates(entry.published,
                                                entry.updated)

        # Scan the entry links: "enclosure" links become attachments,
        # while "alternate" (or rel-less, the Atom default) links give
        # the article permalink.
        link_href = None
        entry_attachments = []
        for entry_link in entry.links:
            rel = entry_link.rel
            if rel == 'enclosure':
                entry_attachments.append(Attachment(
                    link=entry_link.href,
                    mime_type=entry_link.type_,
                    title=_get_attachment_title(entry_link.title,
                                                entry_link.href),
                    size_in_bytes=entry_link.length,
                    duration=None,
                ))
            elif rel == 'alternate' or rel is None:
                link_href = entry_link.href

        # HTML/XHTML titles are unescaped to plain text; others pass through.
        raw_title = entry.title
        if raw_title is None:
            title_text = None
        elif raw_title.text_type in (atom.AtomTextType.html,
                                     atom.AtomTextType.xhtml):
            title_text = html.unescape(raw_title.value).strip()
        else:
            title_text = raw_title.value

        articles.append(Article(
            entry.id_,
            title_text,
            link_href,
            body,
            published,
            updated,
            entry_attachments,
        ))

    # The feed's own URL comes from the first rel="self" link, if any.
    feed_link = None
    for feed_level_link in atom_feed.links:
        if feed_level_link.rel == 'self':
            feed_link = feed_level_link.href
            break

    return Feed(
        atom_feed.title.value if atom_feed.title else atom_feed.id_,
        atom_feed.subtitle.value if atom_feed.subtitle else None,
        feed_link,
        atom_feed.updated,
        articles,
    )


def _adapt_rss_channel(rss_channel: rss.RSSChannel) -> Feed:
    """Convert a parsed RSS channel into the simplified `Feed` model.

    Raises:
        FeedParseError: if the channel has neither a title nor a link,
            since no usable feed title can then be derived.
    """
    # Fail fast: validate before building articles so an unusable
    # channel does not waste work (the original checked after the loop).
    if rss_channel.title is None and rss_channel.link is None:
        raise FeedParseError('RSS feed does not have a title nor a link')

    articles = list()
    for item in rss_channel.items:
        attachments = [
            Attachment(link=e.url, mime_type=e.type, size_in_bytes=e.length,
                       title=_get_attachment_title(None, e.url), duration=None)
            for e in item.enclosures
        ]
        articles.append(Article(
            item.guid or item.link,  # items may lack a guid; fall back to the link
            item.title,
            item.link,
            item.content_encoded or item.description or '',
            item.pub_date,
            None,  # RSS has no per-item update date
            attachments
        ))

    return Feed(
        rss_channel.title if rss_channel.title else rss_channel.link,
        rss_channel.description,
        rss_channel.link,
        rss_channel.pub_date,
        articles
    )


def _adapt_json_feed(json_feed: json_feed.JSONFeed) -> Feed:
    """Convert a parsed JSON feed into the simplified `Feed` model."""
    articles = []
    for item in json_feed.items:
        item_attachments = []
        for attachment in item.attachments:
            item_attachments.append(Attachment(
                attachment.url,
                attachment.mime_type,
                _get_attachment_title(attachment.title, attachment.url),
                attachment.size_in_bytes,
                attachment.duration,
            ))

        # HTML content wins over plain text; missing content becomes ''.
        body = item.content_html or item.content_text or ''

        articles.append(Article(
            item.id_,
            item.title,
            item.url,
            body,
            item.date_published,
            item.date_modified,
            item_attachments,
        ))

    # JSON Feed carries no feed-level update date.
    return Feed(
        json_feed.title,
        json_feed.description,
        json_feed.feed_url,
        None,
        articles,
    )


def _get_article_dates(published_at: Optional[datetime],
                       updated_at: Optional[datetime]
                       ) -> Tuple[Optional[datetime], Optional[datetime]]:
    if published_at and updated_at:
        return published_at, updated_at

    if updated_at:
        return updated_at, None

    if published_at:
        return published_at, None

    raise FeedParseError('Article does not have proper dates')


def _get_attachment_title(attachment_title: Optional[str], link: str) -> str:
    if attachment_title:
        return attachment_title

    parsed_link = urllib.parse.urlparse(link)
    return os.path.basename(parsed_link.path)


def _simple_parse(pairs, content) -> Feed:
    """Run each (parser, adapter) pair on *content* until one yields a Feed.

    Tracks whether the input could still plausibly be XML or JSON so the
    final error distinguishes "not a feed document at all" from "a feed
    document that could not be parsed".
    """
    could_be_xml = True
    could_be_json = True
    for parse, adapt in pairs:
        try:
            return adapt(parse(content))
        except FeedXMLError:
            could_be_xml = False
        except FeedJSONError:
            could_be_json = False
        except FeedParseError:
            pass

    if could_be_xml or could_be_json:
        raise FeedParseError('File is not a valid supported feed')

    raise FeedDocumentError('File is not a supported feed type')


def simple_parse_file(filename: str) -> Feed:
    """Parse an Atom, RSS or JSON feed from a local file."""
    # Parsers are tried in this order until one succeeds.
    parsers_and_adapters = (
        (rss.parse_rss_file, _adapt_rss_channel),
        (atom.parse_atom_file, _adapt_atom_feed),
        (json_feed.parse_json_feed_file, _adapt_json_feed),
    )
    return _simple_parse(parsers_and_adapters, filename)


def simple_parse_bytes(data: bytes) -> Feed:
    """Parse an Atom, RSS or JSON feed from a byte-string containing data."""
    # Parsers are tried in this order until one succeeds.
    parsers_and_adapters = (
        (rss.parse_rss_bytes, _adapt_rss_channel),
        (atom.parse_atom_bytes, _adapt_atom_feed),
        (json_feed.parse_json_feed_bytes, _adapt_json_feed),
    )
    return _simple_parse(parsers_and_adapters, data)