# coding=utf-8
"""
url.py - Sopel URL Title Module

Copyright 2010-2011, Michael Yanovich (yanovich.net) & Kenneth Sham
Copyright 2012-2013, Elsie Powell
Copyright 2013, Lior Ramati <[email protected]>
Copyright 2014, Elad Alfassa <[email protected]>
Licensed under the Eiffel Forum License 2.

https://sopel.chat
"""
|
12
|
|
|
from __future__ import unicode_literals, absolute_import, print_function, division |
|
13
|
|
|
|
|
14
|
|
|
import re |
|
15
|
|
|
|
|
16
|
|
|
import dns.resolver |
|
17
|
|
|
import ipaddress |
|
18
|
|
|
import requests |
|
19
|
|
|
|
|
20
|
|
|
from sopel import __version__, module, tools |
|
21
|
|
|
from sopel.config.types import ListAttribute, StaticSection, ValidatedAttribute |
|
22
|
|
|
from sopel.tools import web |
|
23
|
|
|
|
|
24
|
|
|
# Python3 vs Python2 |
|
25
|
|
|
try: |
|
26
|
|
|
from urllib.parse import urlparse |
|
27
|
|
|
except ImportError: |
|
28
|
|
|
from urlparse import urlparse |
|
29
|
|
|
|
|
30
|
|
|
# Identify the bot to remote servers when fetching pages.
USER_AGENT = 'Sopel/{} (https://sopel.chat)'.format(__version__)
default_headers = {'User-Agent': USER_AGENT}
# These are used to clean up the title tag before actually parsing it. Not the
# world's best way to do this, but it'll do for now.
title_tag_data = re.compile('<(/?)title( [^>]+)?>', re.IGNORECASE)
quoted_title = re.compile('[\'"]<title>[\'"]', re.IGNORECASE)
# Matches "DCC SEND" (case-insensitive); stripped from fetched titles before
# they are relayed -- presumably to block CTCP/DCC injection via crafted page
# titles. TODO confirm original intent.
re_dcc = re.compile(r'(?i)dcc\ssend')
# This sets the maximum number of bytes that should be read in order to find
# the title. We don't want it too high, or a link to a big file/stream will
# just keep downloading until there's no more memory. 640k ought to be enough
# for anybody.
max_bytes = 655360
|
43
|
|
|
|
|
44
|
|
|
|
|
45
|
|
|
class UrlSection(StaticSection):
    """Configuration section (``[url]``) for the URL title module."""

    # TODO some validation rules maybe?
    exclude = ListAttribute('exclude')
    """A list of regular expressions to match URLs for which the title should not be shown."""
    exclusion_char = ValidatedAttribute('exclusion_char', default='!')
    """A character (or string) which, when immediately preceding a URL, will stop that URL's title from being shown."""
    shorten_url_length = ValidatedAttribute(
        'shorten_url_length', int, default=0)
    """If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters."""
    enable_private_resolution = ValidatedAttribute(
        'enable_private_resolution', bool, default=False)
    """Enable URL lookups for RFC1918 addresses"""
    enable_dns_resolution = ValidatedAttribute(
        'enable_dns_resolution', bool, default=False)
    """Enable DNS resolution for all domains to validate if there are RFC1918 resolutions"""
|
60
|
|
|
|
|
61
|
|
|
|
|
62
|
|
|
def configure(config):
    """
    | name | example | purpose |
    | ---- | ------- | ------- |
    | exclude | https?://git\\\\.io/.* | A list of regular expressions for URLs for which the title should not be shown. |
    | exclusion\\_char | ! | A character (or string) which, when immediately preceding a URL, will stop the URL's title from being shown. |
    | shorten\\_url\\_length | 72 | If greater than 0, the title fetcher will include a TinyURL version of links longer than this many characters. |
    | enable\\_private\\_resolution | False | Enable URL lookups for RFC1918 addresses. |
    | enable\\_dns\\_resolution | False | Enable DNS resolution for all domains to validate if there are RFC1918 resolutions. |
    """
    config.define_section('url', UrlSection)

    # Prompt for each setting in turn; order matches the table above.
    prompts = (
        ('exclude',
         'Enter regular expressions for each URL you would like to exclude.'),
        ('exclusion_char',
         'Enter a character which can be prefixed to suppress URL titling'),
        ('shorten_url_length',
         'Enter how many characters a URL should be before the bot puts a'
         ' shorter version of the URL in the title as a TinyURL link'
         ' (0 to disable)'),
        ('enable_private_resolution',
         'Enable URL lookups for RFC1918 addresses?'),
        ('enable_dns_resolution',
         'Enable DNS resolution for all domains to validate if there are RFC1918 resolutions?'),
    )
    for setting, prompt in prompts:
        config.url.configure_setting(setting, prompt)
|
95
|
|
|
|
|
96
|
|
|
|
|
97
|
|
|
def setup(bot):
    """Define the plugin's config section and seed shared memory structures."""
    bot.config.define_section('url', UrlSection)

    configured = bot.config.url.exclude
    regexes = [re.compile(pattern) for pattern in configured] if configured else []

    # We're keeping these in their own list, rather than putting then in the
    # callbacks list because 1, it's easier to deal with modules that are still
    # using this list, and not the newer callbacks list and 2, having a lambda
    # just to pass is kinda ugly.
    if 'url_exclude' in bot.memory:
        existing = bot.memory['url_exclude']
        if regexes:
            existing.extend(regexes)
        bot.memory['url_exclude'] = existing
    else:
        bot.memory['url_exclude'] = regexes

    # Ensure last_seen_url is in memory
    if 'last_seen_url' not in bot.memory:
        bot.memory['last_seen_url'] = tools.SopelMemory()

    # Initialize shortened_urls as a dict if it doesn't exist.
    if 'shortened_urls' not in bot.memory:
        bot.memory['shortened_urls'] = tools.SopelMemory()
|
124
|
|
|
|
|
125
|
|
|
|
|
126
|
|
|
def shutdown(bot):
    """Drop this plugin's transient memory keys.

    Unsets ``url_exclude`` and ``last_seen_url``, but not ``shortened_urls``;
    clearing ``shortened_urls`` would increase API calls, and leaving it in
    memory should not lead to unexpected behavior.
    """
    for key in ('url_exclude', 'last_seen_url'):
        # pop() with a default tolerates keys that were never set
        bot.memory.pop(key, None)
|
135
|
|
|
|
|
136
|
|
|
|
|
137
|
|
|
@module.commands('title')
@module.example(
    '.title https://www.google.com',
    '[ Google ] - www.google.com',
    online=True)
def title_command(bot, trigger):
    """
    Show the title or URL information for the given URL, or the last URL seen
    in this channel.
    """
    if trigger.group(2):
        urls = web.search_urls(
            trigger,
            exclusion_char=bot.config.url.exclusion_char)
    else:
        # No URL given: fall back to the channel's last seen URL, if any
        if trigger.sender not in bot.memory['last_seen_url']:
            return
        last_url = bot.memory['last_seen_url'][trigger.sender]
        if check_callbacks(bot, last_url):
            # Another module owns this URL; don't double-report it
            return
        urls = [last_url]

    for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
        message = '[ %s ] - %s' % (title, domain)
        if tinyurl:
            message += ' ( %s )' % tinyurl
        bot.reply(message)
        bot.memory['last_seen_url'][trigger.sender] = url
|
167
|
|
|
|
|
168
|
|
|
|
|
169
|
|
|
@module.rule(r'(?u).*(https?://\S+).*')
def title_auto(bot, trigger):
    """
    Automatically show titles for URLs. For shortened URLs/redirects, find
    where the URL redirects to and show the title for that (or call a function
    from another module to give more information).
    """
    # Don't double-handle a line that is an explicit title command
    if re.match(bot.config.core.prefix + 'title', trigger):
        return

    # Avoid fetching known malicious links
    if 'safety_cache' in bot.memory:
        safety_cache = bot.memory['safety_cache']
        if trigger in safety_cache and safety_cache[trigger]['positives'] > 1:
            return

    urls = web.search_urls(
        trigger, exclusion_char=bot.config.url.exclusion_char, clean=True)

    for url, title, domain, tinyurl in process_urls(bot, trigger, urls):
        parts = ['[ %s ] - %s' % (title, domain)]
        if tinyurl:
            parts.append('( %s )' % tinyurl)
        message = ' '.join(parts)
        # Guard against responding to other instances of this bot.
        if message != trigger:
            bot.say(message)
            bot.memory['last_seen_url'][trigger.sender] = url
|
195
|
|
|
|
|
196
|
|
|
|
|
197
|
|
|
def process_urls(bot, trigger, urls):
    """Filter a list of URLs and fetch titles for the survivors.

    :param bot: Sopel instance
    :param trigger: the line that contained the URLs
    :param list urls: URLs found in the triggering message
    :return: a generator of ``(url, title, hostname, tinyurl)`` tuples, one
             for each URL that is not excluded or handled by another module

    For each URL in the list, ensure that it isn't handled by another module.
    If ``enable_private_resolution`` is off, URLs pointing at private or
    loopback addresses (literal IPs, and optionally any DNS result when
    ``enable_dns_resolution`` is on) are skipped. ``tinyurl`` is ``None``
    unless the URL exceeds ``shorten_url_length``.
    """
    shorten_url_length = bot.config.url.shorten_url_length
    for url in urls:
        # Exclude URLs that start with the exclusion char
        if url.startswith(bot.config.url.exclusion_char):
            continue

        # Check the URL does not match an existing URL callback
        if check_callbacks(bot, url):
            continue

        # Prevent private addresses from being queried if
        # enable_private_resolution is False
        if not bot.config.url.enable_private_resolution:
            parsed = urlparse(url)

            # Check if it's a literal address like http://192.168.1.1
            try:
                ip = ipaddress.ip_address(parsed.hostname)
            except ValueError:
                # Hostname is a domain name, not a literal IP address
                ip = None
            if ip is not None and (ip.is_private or ip.is_loopback):
                continue

            # If enable_dns_resolution is set, resolve domain names too, to
            # catch records that point into private/loopback space. Literal
            # IPs were already checked above and are not looked up.
            if ip is None and bot.config.url.enable_dns_resolution:
                try:
                    answers = dns.resolver.query(parsed.hostname)
                except dns.exception.DNSException:
                    # Unresolvable hostname: fetching it would fail anyway
                    continue
                private = False
                for result in answers:
                    # to_text() converts the rdata object into a string that
                    # ipaddress accepts; check each result, not the hostname
                    result_ip = ipaddress.ip_address(result.to_text())
                    if result_ip.is_private or result_ip.is_loopback:
                        private = True
                        break
                if private:
                    continue

        # Call the URL to get a title, if possible
        title = find_title(url, verify=bot.config.core.verify_ssl)
        if not title:
            # No title found: don't handle this URL
            continue

        # If the URL is over bot.config.url.shorten_url_length, shorten the URL
        tinyurl = None
        if (shorten_url_length > 0) and (len(url) > shorten_url_length):
            tinyurl = get_or_create_shorturl(bot, url)

        yield (url, title, get_hostname(url), tinyurl)
|
247
|
|
|
|
|
248
|
|
|
|
|
249
|
|
|
def check_callbacks(bot, url):
    """Check if ``url`` is excluded or matches any URL callback patterns.

    :param bot: Sopel instance
    :param str url: URL to check
    :return: ``True`` if ``url`` is excluded or matches any URL Callback
             pattern, ``False`` otherwise

    The ``url_exclude`` patterns stored in ``bot.memory`` are consulted
    first; if none of them match, the ``bot``'s registered URL callback
    patterns are checked.

    .. seealso::

        The :func:`~sopel.modules.url.setup` function that defines the
        ``url_exclude`` in ``bot.memory``.

    .. versionchanged:: 7.0

        This function **does not** trigger URL callbacks anymore when ``url``
        matches a pattern.

    """
    # The exclusion list always wins
    for regex in bot.memory['url_exclude']:
        if regex.search(url):
            return True
    # Otherwise defer to the registered URL callback patterns
    return any(bot.search_url_callbacks(url))
|
275
|
|
|
|
|
276
|
|
|
|
|
277
|
|
|
def find_title(url, verify=True):
    """Return the title for the given URL, or ``None`` if none can be found.

    :param str url: the URL to fetch
    :param bool verify: whether to verify the server's TLS certificate
    :return: the cleaned-up contents of the page's ``<title>`` tag, truncated
             to 200 characters, or ``None`` on any fetch failure
    """
    try:
        # Stream the response so reading can stop early; the timeout keeps a
        # slow or unresponsive server from hanging the bot indefinitely.
        response = requests.get(url, stream=True, verify=verify,
                                timeout=(10.0, 5.0),
                                headers=default_headers)
        content = b''
        for byte in response.iter_content(chunk_size=512):
            content += byte
            # Stop once the closing title tag appears, or once more than
            # max_bytes have been read (don't slurp huge files/streams).
            if b'</title>' in content or len(content) > max_bytes:
                break
        content = content.decode('utf-8', errors='ignore')
        # Need to close the connection because we have not read all
        # the data
        response.close()
    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts, invalid URLs, redirect loops,
        # etc. -- any of which simply means "no title available".
        return None

    # Some cleanup that I don't really grok, but was in the original, so
    # we'll keep it (with the compiled regexes made global) for now.
    content = title_tag_data.sub(r'<\1title>', content)
    content = quoted_title.sub('', content)

    start = content.rfind('<title>')
    end = content.rfind('</title>')
    if start == -1 or end == -1:
        return
    title = web.decode(content[start + 7:end])
    title = title.strip()[:200]

    title = ' '.join(title.split())  # cleanly remove multiple spaces

    # Strip "DCC SEND" -- presumably to block DCC injection via crafted
    # titles; kept from the earliest versions of this module.
    title = re_dcc.sub('', title)

    return title or None
|
312
|
|
|
|
|
313
|
|
|
|
|
314
|
|
|
def get_hostname(url):
    """Return the host portion of ``url`` (everything between the scheme
    prefix and the first slash, including any port or userinfo).
    """
    # Strip the scheme; anything other than https/ftp is assumed to carry a
    # 7-character 'http://' prefix.
    if url.startswith('https://'):
        remainder = url[8:]
    elif url.startswith('ftp://'):
        remainder = url[6:]
    else:
        remainder = url[7:]
    # Everything before the first '/' (or the whole string if there is none)
    return remainder.partition('/')[0]
|
325
|
|
|
|
|
326
|
|
|
|
|
327
|
|
|
def get_or_create_shorturl(bot, url):
    """Get or create a short URL for ``url``

    :param bot: Sopel instance
    :param str url: URL to get or create a short URL for
    :return: A short URL
    :rtype: str

    The short URL is served from the bot's memory when it is already known;
    otherwise one is created (see :func:`get_tinyurl`), cached in memory, and
    returned.
    """
    cache = bot.memory['shortened_urls']
    if url not in cache:
        cache[url] = get_tinyurl(url)
    return cache[url]
|
347
|
|
|
|
|
348
|
|
|
|
|
349
|
|
|
def get_tinyurl(url):
    """Returns a shortened tinyURL link of the URL"""
    base_url = "https://tinyurl.com/api-create.php"
    request_url = "%s?%s" % (base_url, web.urlencode({'url': url}))
    try:
        response = requests.get(request_url)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None
    # Replace text output with https instead of http to make the
    # result an HTTPS link.
    return response.text.replace("http://", "https://")
|
361
|
|
|
|
|
362
|
|
|
|
|
363
|
|
|
if __name__ == "__main__":
    # When run directly, execute this module's @example-decorated tests
    from sopel.test_tools import run_example_tests
    run_example_tests(__file__)
|
366
|
|
|
|