Completed
Push — master ( edf772...9bb3dc )
by Roy
01:07
created

ProjectLoader.module_repr()   A

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 1 Features 0
Metric Value
cc 1
dl 0
loc 2
rs 10
c 1
b 1
f 0
1
#!/usr/bin/env python
2
# -*- encoding: utf-8 -*-
3
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
4
# Author: Binux<[email protected]>
5
#         http://binux.me
6
# Created on 2014-02-16 22:24:20
7
8
import os
9
import six
10
import sys
11
import imp
12
import time
13
import weakref
14
import logging
15
import inspect
16
import traceback
17
import linecache
18
from pyspider.libs import utils
19
from pyspider.libs.log import SaveLogHandler, LogFormatter
20
logger = logging.getLogger("processor")
21
22
23
class ProjectManager(object):
    """
    load projects from projectdb, update project

    Built projects are cached in ``self.projects`` keyed by project name.
    Every cache entry is a dict with the keys ``loader``, ``module``,
    ``class``, ``instance``, ``exception``, ``exception_log``, ``info`` and
    ``load_time`` (see build_module() and _load_project()).
    """

    # seconds between two polls of projectdb.check_update(), see get()
    CHECK_PROJECTS_INTERVAL = 5 * 60
    # seconds after which a cached project is rebuilt, see _need_update()
    RELOAD_PROJECT_INTERVAL = 60 * 60

    @staticmethod
    def build_module(project, env=None):
        '''Build project script as module

        project is a dict that must contain 'name' and 'script'; env is an
        optional dict that is copied (never mutated) and extended with a
        'debug' flag derived from the project status.

        Returns the cache entry dict for the project. Raises AssertionError
        when 'name' or 'script' is missing or when the script defines no
        handler class; exceptions raised by the script itself propagate.
        '''
        from pyspider.libs import base_handler
        assert 'name' in project, 'need name of project'
        assert 'script' in project, 'need script of project'

        if env is None:
            env = {}
        # fix for old non-package version scripts
        pyspider_path = os.path.join(os.path.dirname(__file__), "..")
        if pyspider_path not in sys.path:
            sys.path.insert(1, pyspider_path)

        # work on a copy so the caller's env is left untouched
        env = dict(env)
        env.update({
            'debug': project.get('status', 'DEBUG') == 'DEBUG',
        })

        loader = ProjectLoader(project)
        module = loader.load_module(project['name'])

        # logger inject
        module.log_buffer = []
        module.logging = module.logger = logging.Logger(project['name'])
        if env.get('enable_stdout_capture', True):
            # collect log records into module.log_buffer
            handler = SaveLogHandler(module.log_buffer)
            handler.setFormatter(LogFormatter(color=False))
        else:
            handler = logging.StreamHandler()
            handler.setFormatter(LogFormatter(color=True))
        module.logger.addHandler(handler)

        if '__handler_cls__' not in module.__dict__:
            BaseHandler = module.__dict__.get('BaseHandler', base_handler.BaseHandler)
            # iterate over a snapshot: the loop body writes to module.__dict__
            for each in list(six.itervalues(module.__dict__)):
                if inspect.isclass(each) and each is not BaseHandler \
                        and issubclass(each, BaseHandler):
                    # no break: the last matching subclass wins
                    module.__dict__['__handler_cls__'] = each
        _class = module.__dict__.get('__handler_cls__')
        assert _class is not None, "need BaseHandler in project module"

        instance = _class()
        instance.__env__ = env
        instance.project_name = project['name']
        instance.project = project

        return {
            'loader': loader,
            'module': module,
            'class': _class,
            'instance': instance,
            'exception': None,
            'exception_log': '',
            'info': project,
            'load_time': time.time(),
        }

    def __init__(self, projectdb, env):
        '''Keep projectdb and env, start with an empty project cache'''
        self.projectdb = projectdb
        self.env = env

        self.projects = {}
        self.last_check_projects = time.time()

    def _need_update(self, project_name, updatetime=None, md5sum=None):
        '''Check if project_name need update

        True when the project is not cached, when a given md5sum differs
        from the cached one, when a given updatetime is newer than the
        cached one, or when the cached entry is older than
        RELOAD_PROJECT_INTERVAL; False otherwise.
        '''
        if project_name not in self.projects:
            return True
        elif md5sum and md5sum != self.projects[project_name]['info'].get('md5sum'):
            return True
        elif updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0):
            return True
        elif time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL:
            return True
        return False

    def _check_projects(self):
        '''Check projects by last update time

        Only projects that are already cached are reloaded; others are
        loaded on demand by get().
        '''
        for project in self.projectdb.check_update(self.last_check_projects,
                                                   ['name', 'updatetime']):
            if project['name'] not in self.projects:
                continue
            if project['updatetime'] > self.projects[project['name']]['info'].get('updatetime', 0):
                self._update_project(project['name'])
        self.last_check_projects = time.time()

    def _update_project(self, project_name):
        '''Update one project from database

        Returns None when projectdb has no such project, otherwise the
        result of _load_project().
        '''
        project = self.projectdb.get(project_name)
        if not project:
            return None
        return self._load_project(project)

    def _load_project(self, project):
        '''Load project into self.projects from project info dict

        Returns True on success. On failure the exception is logged, an
        entry carrying 'exception' and 'exception_log' is cached instead and
        False is returned. Note that project gets an 'md5sum' key added.
        '''
        try:
            project['md5sum'] = utils.md5string(project['script'])
            ret = self.build_module(project, self.env)
            self.projects[project['name']] = ret
        except Exception as e:
            logger.exception("load project %s error", project.get('name', None))
            # cache the failure as well so the error can be inspected later
            ret = {
                'loader': None,
                'module': None,
                'class': None,
                'instance': None,
                'exception': e,
                'exception_log': traceback.format_exc(),
                'info': project,
                'load_time': time.time(),
            }
            self.projects[project['name']] = ret
            return False
        logger.debug('project: %s updated.', project.get('name', None))
        return True

    def get(self, project_name, updatetime=None, md5sum=None):
        '''get project data object, return None if not exists

        Polls projectdb for changed projects at most once per
        CHECK_PROJECTS_INTERVAL and (re)loads project_name first when
        _need_update() says so.
        '''
        if time.time() - self.last_check_projects > self.CHECK_PROJECTS_INTERVAL:
            self._check_projects()
        if self._need_update(project_name, updatetime, md5sum):
            self._update_project(project_name)
        return self.projects.get(project_name, None)
155
156
157
class ProjectLoader(object):
    '''ProjectLoader class for sys.meta_path

    Turns a project info dict (needs the keys 'name' and 'script') into a
    module object by compiling and executing the script source.
    '''

    def __init__(self, project, mod=None):
        '''Remember project; mod is an optional module object to execute in'''
        self.project = project
        self.name = project['name']
        self.mod = mod

    def load_module(self, fullname):
        '''Execute the project script in a module and return that module

        The module passed to __init__ (or created by an earlier call) is
        reused; otherwise a new module named fullname is created.
        '''
        # types.ModuleType is the documented replacement for the deprecated
        # imp.new_module()
        import types

        if self.mod is None:
            self.mod = mod = types.ModuleType(fullname)
        else:
            mod = self.mod
        mod.__file__ = '<%s>' % self.name
        mod.__loader__ = self
        mod.__project__ = self.project
        mod.__package__ = ''
        code = self.get_code(fullname)
        six.exec_(code, mod.__dict__)
        # drop cached source lines, the pseudo file '<name>' may have changed
        linecache.clearcache()
        return mod

    def is_package(self, fullname):
        '''A project script is always a plain module'''
        return False

    def get_code(self, fullname):
        '''Compile the project script, using '<name>' as its file name'''
        return compile(self.get_source(fullname), '<%s>' % self.name, 'exec')

    def get_source(self, fullname):
        '''Return the project script; text is encoded to utf8 bytes'''
        script = self.project['script']
        if isinstance(script, six.text_type):
            return script.encode('utf8')
        return script
191
192
193
if six.PY2:
    class ProjectFinder(object):
        '''ProjectFinder class for sys.meta_path

        Serves the pseudo package ``projects`` and its direct submodules
        ``projects.<name>``, whose source comes from projectdb.
        '''

        def __init__(self, projectdb):
            # only a weak reference is kept to projectdb
            self.get_projectdb = weakref.ref(projectdb)

        @property
        def projectdb(self):
            '''The referenced projectdb, or None once it has been collected'''
            return self.get_projectdb()

        def find_module(self, fullname, path=None):
            '''Return a loader for 'projects' / 'projects.<name>', else None'''
            if fullname == 'projects':
                return self
            parts = fullname.split('.')
            if len(parts) == 2 and parts[0] == 'projects':
                name = parts[1]
                # dereference the weakref once so it cannot vanish between
                # the check and the lookup
                projectdb = self.projectdb
                if not projectdb:
                    return None
                info = projectdb.get(name)
                if info:
                    return ProjectLoader(info)
            return None

        def load_module(self, fullname):
            '''Create the empty ``projects`` package module'''
            mod = imp.new_module(fullname)
            mod.__file__ = '<projects>'
            mod.__loader__ = self
            mod.__path__ = ['<projects>']
            mod.__package__ = 'projects'
            return mod

        def is_package(self, fullname):
            return True
else:
    import importlib.abc
    # find_spec() below needs importlib.util; importing importlib.abc alone
    # does not guarantee that the submodule is loaded
    import importlib.util

    class ProjectFinder(importlib.abc.MetaPathFinder):
        '''ProjectFinder class for sys.meta_path

        Serves the pseudo package ``projects`` and its direct submodules
        ``projects.<name>``, whose source comes from projectdb.
        '''

        def __init__(self, projectdb):
            # only a weak reference is kept to projectdb
            self.get_projectdb = weakref.ref(projectdb)

        @property
        def projectdb(self):
            '''The referenced projectdb, or None once it has been collected'''
            return self.get_projectdb()

        def find_spec(self, fullname, path, target=None):
            '''Return a module spec built from find_module(), else None'''
            loader = self.find_module(fullname, path)
            if loader:
                return importlib.util.spec_from_loader(fullname, loader)
            return None

        def find_module(self, fullname, path=None):
            '''Return a loader for 'projects' / 'projects.<name>', else None'''
            if fullname == 'projects':
                return ProjectsLoader()
            parts = fullname.split('.')
            if len(parts) == 2 and parts[0] == 'projects':
                name = parts[1]
                # dereference the weakref once so it cannot vanish between
                # the check and the lookup
                projectdb = self.projectdb
                if not projectdb:
                    return None
                info = projectdb.get(name)
                if info:
                    return ProjectLoader(info)
            return None

    class ProjectsLoader(importlib.abc.InspectLoader):
        '''Loader for the ``projects`` package itself: a package, no source'''

        def is_package(self, fullname):
            return True

        def get_source(self, path):
            return ''

    class ProjectLoader(ProjectLoader, importlib.abc.Loader):
        '''ProjectLoader extended with the create_module/exec_module API'''

        def create_module(self, spec):
            # the script is already executed here, by load_module()
            return self.load_module(spec.name)

        def exec_module(self, module):
            # nothing left to do, see create_module()
            return module

        def module_repr(self, module):
            return '<Module projects.%s>' % self.name
272