import logging
import random
import time
from collections.abc import ByteString, Callable, Mapping
from dataclasses import dataclass
from datetime import timedelta
from typing import Any
from urllib import request

logger = logging.getLogger("pocketutils")


def download_urllib(req: request.Request) -> bytes:
    """Downloads the response body for a prepared request using urllib."""
    with request.urlopen(req) as q:
        return q.read()
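
# The querier argument of QueryExecutor accepts any callable with the same
# signature as download_urllib, so the transport can be swapped. A minimal
# sketch, assuming you want to send a custom User-Agent (the header value
# below is just a placeholder):
#
#     def download_with_agent(req: request.Request) -> bytes:
#         req.add_header("User-Agent", "pocketutils-example/1.0")
#         with request.urlopen(req) as q:
#             return q.read()
#
#     executor = QueryExecutor(querier=download_with_agent)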


@dataclass(frozen=True, repr=True, order=True)
class TimeTaken:
    """How long the last request took: the query itself and the rate-limit wait before it."""

    query: timedelta
    wait: timedelta


class QueryExecutor:
    """
    A synchronous GET/POST query executor that limits the rate of requests.

    Each call sleeps as needed so that a random delay, drawn uniformly between
    sec_delay_min and sec_delay_max seconds, elapses between the end of one
    request and the start of the next.
    """

    def __init__(
        self,
        sec_delay_min: float = 0.25,
        sec_delay_max: float = 0.25,
        encoding: str | None = "utf-8",
        querier: Callable[[request.Request], ByteString] | None = None,
    ):
        self._min = sec_delay_min
        self._max = sec_delay_max
        self._rand = random.Random()  # nosec
        self._encoding = encoding
        self._next_at = 0
        self._querier = download_urllib if querier is None else querier
        self._time_taken: TimeTaken | None = None

    @property
    def last_time_taken(self) -> TimeTaken | None:
        """Time taken by the most recent call, or None if nothing has been queried yet."""
        return self._time_taken

    def __call__(
        self,
        url: str,
        method: str = "get",
        encoding: str | None = "-1",
        headers: Mapping[str, str] | None = None,
        errors: str = "ignore",
    ) -> str:
        headers = {} if headers is None else headers
        # "-1" is a sentinel meaning "use the encoding passed to __init__".
        encoding = self._encoding if encoding == "-1" else encoding
        now = time.monotonic()
        # Sleep, if needed, until the earliest time this request is allowed to start.
        wait_secs = max(self._next_at - now, 0.0)
        if wait_secs > 0:
            time.sleep(wait_secs)
        now = time.monotonic()
        # HTTP method names are case-sensitive, so normalize (e.g. "get" -> "GET").
        req = request.Request(url=url, method=method.upper(), headers=headers)
        content = self._querier(req)
        if encoding is None:
            data = content.decode(errors=errors)
        else:
            data = content.decode(encoding=encoding, errors=errors)
        now_ = time.monotonic()
        self._time_taken = TimeTaken(
            query=timedelta(seconds=now_ - now),
            wait=timedelta(seconds=wait_secs),
        )
        self._next_at = now_ + self._rand.uniform(self._min, self._max)
        return data
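

# Usage sketch (illustrative only; the URL and delay bounds are placeholders):
#
#     executor = QueryExecutor(sec_delay_min=0.5, sec_delay_max=1.5)
#     page = executor("https://example.com", method="GET")
#     taken = executor.last_time_taken
#     logger.debug("query took %s after waiting %s", taken.query, taken.wait)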


class QueryMixin:
    """Mixin for classes that own a QueryExecutor and want simple, logged GET queries."""

    @property
    def executor(self) -> QueryExecutor:
        raise NotImplementedError()

    def _query(self, url: str, *, sink: Callable[[str], Any] = logger.debug) -> str:
        data = self.executor(url)
        tt = self.executor.last_time_taken
        wt, qt = tt.wait.total_seconds(), tt.query.total_seconds()
        # Approximate size in kilobits; len() counts characters of the decoded text.
        kbits = int(len(data) * 8 / 1024)
        sink(f"Queried {kbits} kb from {url} in {qt:.1f} s with {wt:.1f} s of wait")
        return data
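
# Subclassing sketch (the class and attribute names here are hypothetical):
#
#     class ExampleClient(QueryMixin):
#         def __init__(self) -> None:
#             self._executor = QueryExecutor()
#
#         @property
#         def executor(self) -> QueryExecutor:
#             return self._executor
#
#         def fetch(self, url: str) -> str:
#             return self._query(url)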


__all__ = ["QueryExecutor", "TimeTaken", "QueryMixin"]