sopel.modules.url.title_auto()   B
last analyzed

Complexity

Conditions 8

Size

Total Lines 26
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 16
dl 0
loc 26
rs 7.3333
c 0
b 0
f 0
cc 8
nop 2
1
# coding=utf-8
2
"""
3
url.py - Sopel URL Title Module
4
Copyright 2010-2011, Michael Yanovich (yanovich.net) & Kenneth Sham
5
Copyright 2012-2013, Elsie Powell
6
Copyright 2013, Lior Ramati <[email protected]>
7
Copyright 2014, Elad Alfassa <[email protected]>
8
Licensed under the Eiffel Forum License 2.
9
10
https://sopel.chat
11
"""
12
from __future__ import unicode_literals, absolute_import, print_function, division
13
14
import re
15
16
import dns.resolver
17
import ipaddress
18
import requests
19
20
from sopel import __version__, module, tools
21
from sopel.config.types import ListAttribute, StaticSection, ValidatedAttribute
22
from sopel.tools import web
23
24
# Python3 vs Python2
25
try:
26
    from urllib.parse import urlparse
27
except ImportError:
28
    from urlparse import urlparse
29
30
# Sent with every outgoing title request so remote servers can identify
# the bot.
USER_AGENT = 'Sopel/{} (https://sopel.chat)'.format(__version__)
default_headers = {'User-Agent': USER_AGENT}

# Markup clean-up applied by find_title() before the title is extracted:
# the first pattern matches an opening or closing title tag in any letter
# case, with optional attributes, so it can be rewritten to a bare tag; the
# second matches a quoted literal "<title>" so it can be dropped.
title_tag_data = re.compile('<(/?)title( [^>]+)?>', re.I)
quoted_title = re.compile('[\'"]<title>[\'"]', re.I)

# Case-insensitive match for "dcc send" (one whitespace character between the
# words); find_title() removes it from every title it returns.
# NOTE(review): presumably a guard against the old "DCC SEND" exploit string
# being echoed into a channel -- confirm.
re_dcc = re.compile(r'(?i)dcc\ssend')

# Upper bound on how much of a response body is read while looking for the
# title. Without it, a link to a big file or an endless stream would keep
# downloading until memory runs out. 640k ought to be enough for anybody.
max_bytes = 640 * 1024
43
44
45
class UrlSection(StaticSection):
    """Settings of the ``url`` configuration section used by this module."""
    # TODO some validation rules maybe?

    exclude = ListAttribute('exclude')
    """A list of regular expressions to match URLs for which the title should not be shown."""

    exclusion_char = ValidatedAttribute('exclusion_char', default='!')
    """A character (or string) which, when immediately preceding a URL, will stop that URL's title from being shown."""

    shorten_url_length = ValidatedAttribute('shorten_url_length', int, default=0)
    """If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters."""

    enable_private_resolution = ValidatedAttribute('enable_private_resolution', bool, default=False)
    """Enable URL lookups for RFC1918 addresses"""

    enable_dns_resolution = ValidatedAttribute('enable_dns_resolution', bool, default=False)
    """Enable DNS resolution for all domains to validate if there are RFC1918 resolutions"""
60
61
62
def configure(config):
    """
    | name | example | purpose |
    | ---- | ------- | ------- |
    | exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
    | exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
    | shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. |
    | enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. |
    | enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. |
    """
    # (setting name, prompt) pairs, asked in this order.
    questions = (
        ('exclude',
         'Enter regular expressions for each URL you would like to exclude.'),
        ('exclusion_char',
         'Enter a character which can be prefixed to suppress URL titling'),
        ('shorten_url_length',
         'Enter how many characters a URL should be before the bot puts a'
         ' shorter version of the URL in the title as a TinyURL link'
         ' (0 to disable)'),
        ('enable_private_resolution',
         'Enable URL lookups for RFC1918 addresses?'),
        ('enable_dns_resolution',
         'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?'),
    )

    # The section has to exist before any of its settings can be configured.
    config.define_section('url', UrlSection)
    for setting, prompt in questions:
        config.url.configure_setting(setting, prompt)
95
96
97
def setup(bot):
    """Register the ``url`` section and prepare the bot memory for this module.

    After this call ``bot.memory`` holds ``url_exclude`` (compiled exclusion
    patterns), ``last_seen_url`` and ``shortened_urls``.
    """
    bot.config.define_section('url', UrlSection)

    configured = bot.config.url.exclude
    compiled = [re.compile(pattern) for pattern in configured] if configured else []

    # These patterns live in their own list rather than in the callbacks
    # list: modules that still use this list keep working, and no dummy
    # lambda is needed just to "pass".
    if 'url_exclude' in bot.memory:
        # Another module created the list first: add our patterns to it.
        known = bot.memory['url_exclude']
        if compiled:
            known.extend(compiled)
        bot.memory['url_exclude'] = known
    else:
        bot.memory['url_exclude'] = compiled

    # Create the per-channel "last URL" store and the short-URL cache only
    # when they are not there yet.
    for key in ('last_seen_url', 'shortened_urls'):
        if key not in bot.memory:
            bot.memory[key] = tools.SopelMemory()
124
125
126
def shutdown(bot):
    """Drop this module's transient entries from ``bot.memory``.

    ``url_exclude`` and ``last_seen_url`` are removed when present.
    ``shortened_urls`` is kept on purpose: clearing it would only cause
    additional API calls, and leaving it in memory should not lead to
    unexpected behavior.
    """
    memory = bot.memory
    for name in ('url_exclude', 'last_seen_url'):
        try:
            del memory[name]
        except KeyError:
            # Already gone: nothing to clean up for this entry.
            continue
135
136
137
@module.commands('title')
@module.example(
    '.title https://www.google.com',
    '[ Google ] - www.google.com',
    online=True)
def title_command(bot, trigger):
    """
    Show the title or URL information for the given URL, or the last URL seen
    in this channel.
    """
    if trigger.group(2):
        # URLs were given with the command: use those.
        urls = web.search_urls(
            trigger, exclusion_char=bot.config.url.exclusion_char)
    else:
        # No argument: fall back to the last URL seen in this channel.
        seen = bot.memory['last_seen_url']
        if trigger.sender not in seen:
            return
        if check_callbacks(bot, seen[trigger.sender]):
            # Excluded, or matched by a URL callback pattern.
            return
        urls = [seen[trigger.sender]]

    for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
        message = '[ %s ] - %s' % (title, domain)
        suffix = ' ( %s )' % tinyurl if tinyurl else ''
        bot.reply(message + suffix)
        bot.memory['last_seen_url'][trigger.sender] = url
167
168
169
@module.rule(r'(?u).*(https?://\S+).*')
def title_auto(bot, trigger):
    """
    Automatically show titles for URLs. For shortened URLs/redirects, find
    where the URL redirects to and show the title for that (or call a function
    from another module to give more information).
    """
    # Lines that start with the title command are left to title_command().
    title_command_start = bot.config.core.prefix + 'title'
    if re.match(title_command_start, trigger):
        return

    # Avoid fetching known malicious links
    if 'safety_cache' in bot.memory:
        flagged = bot.memory['safety_cache']
        if trigger in flagged and flagged[trigger]['positives'] > 1:
            return

    candidates = web.search_urls(
        trigger, exclusion_char=bot.config.url.exclusion_char, clean=True)

    for url, title, domain, tinyurl in process_urls(bot, trigger, candidates):
        message = '[ %s ] - %s' % (title, domain)
        if tinyurl:
            message += ' ( %s )' % tinyurl
        if message == trigger:
            # Guard against responding to other instances of this bot.
            continue
        bot.say(message)
        bot.memory['last_seen_url'][trigger.sender] = url
195
196
197
def _is_private_address(address):
    """Tell whether ``address`` is a private or loopback IP address.

    :raise ValueError: when ``address`` is not an IP address at all
    """
    ip = ipaddress.ip_address(address)
    return ip.is_private or ip.is_loopback


def _is_private_host(hostname, resolve):
    """Tell whether fetching from ``hostname`` would hit a private address.

    :param hostname: host part of a URL (may be ``None``)
    :param bool resolve: when true, domain names are resolved through DNS
                         and every returned address is checked
    :return: ``True`` if the host is (or resolves to) a private or loopback
             address, or if it cannot be resolved; ``False`` otherwise
    """
    try:
        # Address like http://192.168.1.1: no DNS lookup is needed (or
        # possible) for an IP literal.
        return _is_private_address(hostname)
    except ValueError:
        pass  # not an IP literal (or no hostname at all)

    if not resolve or not hostname:
        return False

    try:
        answers = dns.resolver.query(hostname)
    except dns.exception.DNSException:
        # The target cannot be verified, so stay on the safe side.
        return True

    for answer in answers:
        # Records are rdata objects; ip_address() needs their text form.
        text = answer.to_text()
        if isinstance(text, bytes):
            text = text.decode('ascii')
        try:
            if _is_private_address(text):
                return True
        except ValueError:
            continue  # not an address record
    return False


def process_urls(bot, trigger, urls):
    """
    Yield the title information of each URL that this module should handle.

    A URL is skipped when it starts with the exclusion char, when it is
    excluded or matched by a URL callback pattern (see ``check_callbacks``),
    when it points to a private/loopback address while
    ``enable_private_resolution`` is off, or when no title can be found.

    :param bot: Sopel instance
    :param trigger: the trigger being handled (not used here)
    :param urls: iterable of URLs to process
    :return: a generator of ``(url, title, hostname, tinyurl)`` tuples;
             ``tinyurl`` is ``None`` unless the URL is longer than the
             ``shorten_url_length`` setting (and that setting is > 0)
    """
    shorten_url_length = bot.config.url.shorten_url_length
    for url in urls:
        # Exclude URLs that start with the exclusion char
        if url.startswith(bot.config.url.exclusion_char):
            continue

        # Check the URL does not match an existing URL callback
        if check_callbacks(bot, url):
            continue

        # Prevent private addresses from being queried if
        # enable_private_resolution is False; domain names are resolved
        # first only if enable_dns_resolution is set.
        if not bot.config.url.enable_private_resolution:
            hostname = urlparse(url).hostname
            if _is_private_host(hostname, bot.config.url.enable_dns_resolution):
                continue

        # Call the URL to get a title, if possible
        title = find_title(url, verify=bot.config.core.verify_ssl)
        if not title:
            # No title found: don't handle this URL
            continue

        # If the URL is over bot.config.url.shorten_url_length, shorten it
        tinyurl = None
        if shorten_url_length > 0 and len(url) > shorten_url_length:
            tinyurl = get_or_create_shorturl(bot, url)

        yield (url, title, get_hostname(url), tinyurl)
247
248
249
def check_callbacks(bot, url):
    """Tell whether ``url`` is excluded or matches a URL callback pattern.

    :param bot: Sopel instance
    :param str url: URL to check
    :return: True if ``url`` is excluded or matches any URL Callback pattern

    The ``url_exclude`` patterns stored in ``bot.memory`` are tried first:
    as soon as one of them matches, the result is ``True``. When none does,
    the result tells whether the ``bot`` knows at least one URL Callback
    pattern that matches ``url``.

    .. seealso::

        The :func:`~sopel.modules.url.setup` function that defines the
        ``url_exclude`` in ``bot.memory``.

    .. versionchanged:: 7.0

        This function **does not** trigger URL callbacks anymore when ``url``
        matches a pattern.

    """
    # The exclusion list always takes precedence over the callbacks.
    for pattern in bot.memory['url_exclude']:
        if pattern.search(url):
            return True
    return any(bot.search_url_callbacks(url))
275
276
277
def find_title(url, verify=True, timeout=10):
    """Return the title for the given URL.

    :param str url: URL of the page to fetch
    :param bool verify: passed to ``requests`` to control TLS verification
    :param timeout: passed to ``requests``; bounds how long the connection
                    and each read may stall, so that an unresponsive server
                    cannot block the caller forever
    :return: the cleaned-up title (at most 200 characters before whitespace
             is collapsed), or ``None`` when the page cannot be fetched or
             has no usable title
    """
    try:
        response = requests.get(url, stream=True, verify=verify,
                                headers=default_headers, timeout=timeout)
    except requests.exceptions.RequestException:
        # Covers connection errors as well as timeouts, invalid URLs, too
        # many redirects, ...
        return None

    try:
        content = b''
        for chunk in response.iter_content(chunk_size=512):
            content += chunk
            # Stop as soon as the title is complete, or when too much data
            # has been read without finding one.
            if b'</title>' in content or len(content) > max_bytes:
                break
    except requests.exceptions.RequestException:
        return None
    finally:
        # Need to close the connection because we have (usually) not read
        # all the data; this must also happen when reading fails.
        response.close()

    content = content.decode('utf-8', errors='ignore')

    # Normalize every title tag to a bare <title> / </title>, then drop
    # quoted literal "<title>" strings so they are not taken for the tag.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)

    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return None

    # 7 == len('<title>')
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]

    title = ' '.join(title.split())  # cleanly remove multiple spaces

    # Remove any "dcc send" from the title before it is returned.
    title = re_dcc.sub('', title)

    return title or None
312
313
314
def get_hostname(url):
    """Return the network location of ``url``.

    :param str url: URL to extract the host from
    :return: everything between ``scheme://`` and the start of the path,
             query string or fragment (so a port, if any, is included)
    :rtype: str

    The URL is parsed rather than sliced at fixed offsets, so a URL without
    a path (``https://example.com?next=/home``) yields ``example.com``
    whatever its scheme is.
    """
    try:
        netloc = urlparse(url).netloc
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. an unclosed "[").
        netloc = ''
    if netloc:
        return netloc

    # Fallback for input that urlparse finds no host in (such as a URL
    # without "scheme://"): keep what precedes the first "/", "?" or "#".
    remainder = url.split('://', 1)[-1]
    return re.split(r'[/?#]', remainder, maxsplit=1)[0]
325
326
327
def get_or_create_shorturl(bot, url):
    """Return the short URL for ``url``, creating it on first use.

    :param bot: Sopel instance
    :param str url: URL to get or create a short URL for
    :return: A short URL
    :rtype: str

    Short URLs are kept in the bot's memory (``shortened_urls``). A URL that
    is not there yet is shortened (see :func:`get_tinyurl`) and the result
    is stored before it is returned.
    """
    cache = bot.memory['shortened_urls']
    if url not in cache:
        # First time this URL is seen: shorten it and remember the result.
        cache[url] = get_tinyurl(url)
    return cache[url]
347
348
349
def get_tinyurl(url):
    """Return a TinyURL link for ``url``, or ``None`` if the request fails."""
    base_url = "https://tinyurl.com/api-create.php"
    query = web.urlencode({'url': url})
    api_call = "%s?%s" % (base_url, query)

    try:
        response = requests.get(api_call)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None
    else:
        # Replace text output with https instead of http to make the
        # result an HTTPS link.
        return response.text.replace("http://", "https://")
361
362
363
if __name__ == "__main__":
    # Executed as a script: hand this file to Sopel's example test runner.
    from sopel import test_tools

    test_tools.run_example_tests(__file__)
366