1
|
|
|
import logging |
|
|
|
|
2
|
|
|
import random |
3
|
|
|
import time |
4
|
|
|
from collections.abc import ByteString, Callable, Mapping |
5
|
|
|
from dataclasses import dataclass |
6
|
|
|
from datetime import timedelta |
7
|
|
|
from typing import Any |
8
|
|
|
from urllib import request |
9
|
|
|
|
10
|
|
|
# Package-level logger; the name is fixed to "pocketutils" rather than derived from this module.
logger = logging.getLogger(name="pocketutils")
11
|
|
|
|
12
|
|
|
|
13
|
|
|
def download_urllib(req: request.Request) -> bytes: |
|
|
|
|
14
|
|
|
with request.urlopen(req) as q: |
|
|
|
|
15
|
|
|
return q.read() |
16
|
|
|
|
17
|
|
|
|
18
|
|
|
@dataclass(frozen=True, repr=True, order=True) |
|
|
|
|
19
|
|
|
class TimeTaken: |
20
|
|
|
query: timedelta |
21
|
|
|
wait: timedelta |
22
|
|
|
|
23
|
|
|
|
24
|
|
|
class QueryExecutor: |
25
|
|
|
""" |
26
|
|
|
A synchronous GET/POST query executor that limits the rate of requests. |
27
|
|
|
""" |
28
|
|
|
|
29
|
|
|
def __init__( |
30
|
|
|
self, |
|
|
|
|
31
|
|
|
sec_delay_min: float = 0.25, |
|
|
|
|
32
|
|
|
sec_delay_max: float = 0.25, |
|
|
|
|
33
|
|
|
encoding: str | None = "utf-8", |
|
|
|
|
34
|
|
|
querier: Callable[[request.Request], ByteString] | None = None, |
|
|
|
|
35
|
|
|
): |
36
|
|
|
self._min = sec_delay_min |
37
|
|
|
self._max = sec_delay_max |
38
|
|
|
self._rand = random.Random() # nosec |
39
|
|
|
self._encoding = encoding |
40
|
|
|
self._next_at = 0 |
41
|
|
|
self._querier = download_urllib if querier is None else querier |
42
|
|
|
self._time_taken = None |
43
|
|
|
|
44
|
|
|
@property |
45
|
|
|
def last_time_taken(self) -> TimeTaken: |
|
|
|
|
46
|
|
|
return self._time_taken |
47
|
|
|
|
48
|
|
|
def __call__( |
|
|
|
|
49
|
|
|
self, |
|
|
|
|
50
|
|
|
url: str, |
|
|
|
|
51
|
|
|
method: str = "get", |
|
|
|
|
52
|
|
|
encoding: str | None = "-1", |
|
|
|
|
53
|
|
|
headers: Mapping[str, str] | None = None, |
|
|
|
|
54
|
|
|
errors: str = "ignore", |
|
|
|
|
55
|
|
|
) -> str: |
56
|
|
|
headers = {} if headers is None else headers |
57
|
|
|
encoding = self._encoding if encoding == "-1" else encoding |
58
|
|
|
now = time.monotonic() |
59
|
|
|
wait_secs = self._next_at - now |
60
|
|
|
if now < self._next_at: |
61
|
|
|
time.sleep(wait_secs) |
62
|
|
|
now = time.monotonic() |
63
|
|
|
req = request.Request(url=url, method=method, headers=headers) |
64
|
|
|
content = self._querier(req) |
65
|
|
|
if encoding is None: |
66
|
|
|
data = content.decode(errors=errors) |
67
|
|
|
else: |
68
|
|
|
data = content.decode(encoding=encoding, errors=errors) |
69
|
|
|
now_ = time.monotonic() |
70
|
|
|
self._time_taken = TimeTaken(timedelta(seconds=wait_secs), timedelta(seconds=now_ - now)) |
71
|
|
|
self._next_at = now_ + self._rand.uniform(self._min, self._max) |
72
|
|
|
return data |
73
|
|
|
|
74
|
|
|
|
75
|
|
|
class QueryMixin:
    """
    Mixin that adds a logged, rate-limited ``_query`` helper.

    Subclasses must override ``executor`` to supply the ``QueryExecutor`` to use.
    """

    @property
    def executor(self) -> QueryExecutor:
        """The executor used by ``_query``; must be overridden by subclasses."""
        raise NotImplementedError()

    def _query(self, url: str, *, sink: Callable[[str], Any] = logger.debug) -> str:
        """
        Fetch ``url`` through ``executor`` and report size and timing to ``sink``.

        Args:
            url: The URL to request.
            sink: Receives the one-line summary message; defaults to ``logger.debug``.

        Returns:
            The decoded response body.
        """
        # Read the property once so the timing comes from the same executor
        # that ran the query, even if the property builds a new one per access.
        executor = self.executor
        data = executor(url)
        tt = executor.last_time_taken
        wt, qt = tt.wait.total_seconds(), tt.query.total_seconds()
        # Size estimate: decoded character count * 8 / 1024, reported as "kb".
        bts = int(len(data) * 8 / 1024)
        # `.1f` gives one decimal place; a bare `.1` would mean one significant
        # digit and render e.g. 12.3 as "1e+01".
        sink(f"Queried {bts} kb from {url} in {qt:.1f} s with {wt:.1f} s of wait")
        return data
87
|
|
|
|
88
|
|
|
|
89
|
|
|
# Names exported by `from <module> import *`.
__all__ = [
    "QueryExecutor",
    "TimeTaken",
    "QueryMixin",
]
90
|
|
|
|