Completed
Push — master ( 1cc082...22211d )
by Roy
01:17
created

ProjectManager.__init__()   A

Complexity

Conditions 1

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 6
rs 9.4285
c 0
b 0
f 0
1
#!/usr/bin/env python
2
# -*- encoding: utf-8 -*-
3
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
4
# Author: Binux<[email protected]>
5
#         http://binux.me
6
# Created on 2014-02-16 22:24:20
7
8
import os
import sys
try:
    # `imp` was removed in Python 3.12; guard the import so this module
    # stays importable there.
    import imp
except ImportError:  # pragma: no cover
    imp = None
import time
import types
import weakref
import logging
import inspect
import traceback
import linecache

import six

from pyspider.libs import utils
from pyspider.libs.log import SaveLogHandler, LogFormatter
20
# Module-level logger used when loading projects in this file.
logger = logging.getLogger("processor")
21
22
23
class ProjectManager(object):
    """
    Load projects from projectdb and keep the loaded copies up to date.

    Loaded projects are cached in ``self.projects``, keyed by project name.
    Each entry is a dict with the keys ``loader``, ``module``, ``class``,
    ``instance``, ``exception``, ``exception_log``, ``info`` and
    ``load_time``.
    """

    # Minimum number of seconds between two polls of projectdb for changes.
    CHECK_PROJECTS_INTERVAL = 5 * 60
    # Age in seconds after which a cached project is treated as stale.
    RELOAD_PROJECT_INTERVAL = 60 * 60

    @staticmethod
    def build_module(project, env=None):
        '''Build project script as module

        :param project: project info dict; must contain ``name`` and
            ``script``. ``status`` is optional and defaults to ``'DEBUG'``.
        :param env: optional mapping; a copy of it (plus a ``debug`` flag)
            is attached to the handler instance as ``__env__``. Its
            ``enable_stdout_capture`` entry (default True) selects between a
            ``SaveLogHandler`` bound to ``module.log_buffer`` and a plain
            ``logging.StreamHandler``.
        :return: dict with the keys ``loader``, ``module``, ``class``,
            ``instance``, ``exception``, ``exception_log``, ``info`` and
            ``load_time``.
        :raises AssertionError: if ``name`` or ``script`` is missing, or if
            the script defines no subclass of ``BaseHandler``.
        '''
        # Explicit raises rather than `assert` statements: asserts are
        # stripped under `python -O`, which would silently drop the checks.
        # Exception type and messages are the same as before.
        if 'name' not in project:
            raise AssertionError('need name of project')
        if 'script' not in project:
            raise AssertionError('need script of project')

        from pyspider.libs import base_handler

        # fix for old non-package version scripts
        pyspider_path = os.path.join(os.path.dirname(__file__), "..")
        if pyspider_path not in sys.path:
            sys.path.insert(1, pyspider_path)

        # Work on a copy so the caller's mapping is never mutated.
        env = {} if env is None else dict(env)
        env.update({
            'debug': project.get('status', 'DEBUG') == 'DEBUG',
        })

        loader = ProjectLoader(project)
        module = loader.load_module(project['name'])

        # logger inject
        module.log_buffer = []
        module.logging = module.logger = logging.Logger(project['name'])
        if env.get('enable_stdout_capture', True):
            handler = SaveLogHandler(module.log_buffer)
            handler.setFormatter(LogFormatter(color=False))
        else:
            handler = logging.StreamHandler()
            handler.setFormatter(LogFormatter(color=True))
        module.logger.addHandler(handler)

        # Unless the script names its handler class explicitly, pick a
        # subclass of BaseHandler from the module namespace. There is no
        # `break`, so when several subclasses exist the last one visited
        # is the one kept.
        if '__handler_cls__' not in module.__dict__:
            BaseHandler = module.__dict__.get('BaseHandler', base_handler.BaseHandler)
            for each in list(six.itervalues(module.__dict__)):
                if inspect.isclass(each) and each is not BaseHandler \
                        and issubclass(each, BaseHandler):
                    module.__dict__['__handler_cls__'] = each
        _class = module.__dict__.get('__handler_cls__')
        if _class is None:
            raise AssertionError("need BaseHandler in project module")

        instance = _class()
        instance.__env__ = env
        instance.project_name = project['name']
        instance.project = project

        return {
            'loader': loader,
            'module': module,
            'class': _class,
            'instance': instance,
            'exception': None,
            'exception_log': '',
            'info': project,
            'load_time': time.time(),
        }

    def __init__(self, projectdb, env):
        '''
        :param projectdb: object providing ``get(name)`` and
            ``check_update(timestamp, fields)``.
        :param env: mapping passed to ``build_module`` for every project.
        '''
        self.projectdb = projectdb
        self.env = env

        self.projects = {}
        self.last_check_projects = time.time()

    def _need_update(self, project_name, updatetime=None, md5sum=None):
        '''Check if project_name need update

        Returns True when the project is not cached, when the given
        ``md5sum`` differs from the cached one, when the given
        ``updatetime`` is newer than the cached one, or when the cached
        copy is older than ``RELOAD_PROJECT_INTERVAL``.
        '''
        if project_name not in self.projects:
            return True
        elif md5sum and md5sum != self.projects[project_name]['info'].get('md5sum'):
            return True
        elif updatetime and updatetime > self.projects[project_name]['info'].get('updatetime', 0):
            return True
        elif time.time() - self.projects[project_name]['load_time'] > self.RELOAD_PROJECT_INTERVAL:
            return True
        return False

    def _check_projects(self):
        '''Check projects by last update time

        Only projects that are already cached are reloaded here; unknown
        projects are loaded on demand by ``get``.
        '''
        # Take the timestamp before querying: a project changed while the
        # query or the reloads are running is then still covered by the
        # next poll instead of falling between the two.
        check_started = time.time()
        for project in self.projectdb.check_update(self.last_check_projects,
                                                   ['name', 'updatetime']):
            if project['name'] not in self.projects:
                continue
            if project['updatetime'] > self.projects[project['name']]['info'].get('updatetime', 0):
                self._update_project(project['name'])
        self.last_check_projects = check_started

    def _update_project(self, project_name):
        '''Update one project from database

        Returns None when projectdb has no such project, otherwise the
        result of ``_load_project``.
        '''
        project = self.projectdb.get(project_name)
        if not project:
            return None
        return self._load_project(project)

    def _load_project(self, project):
        '''Load project into self.projects from project info dict

        Returns True on success. On failure the exception is logged, a
        placeholder entry carrying the exception and its traceback is
        stored in ``self.projects``, and False is returned.
        '''
        try:
            project['md5sum'] = utils.md5string(project['script'])
            ret = self.build_module(project, self.env)
            self.projects[project['name']] = ret
        except Exception as e:
            logger.exception("load project %s error", project.get('name', None))
            ret = {
                'loader': None,
                'module': None,
                'class': None,
                'instance': None,
                'exception': e,
                'exception_log': traceback.format_exc(),
                'info': project,
                'load_time': time.time(),
            }
            self.projects[project['name']] = ret
            return False
        logger.debug('project: %s updated.', project.get('name', None))
        return True

    def get(self, project_name, updatetime=None, md5sum=None):
        '''get project data object, return None if not exists

        Polls projectdb for changes at most once per
        ``CHECK_PROJECTS_INTERVAL`` and reloads the requested project when
        ``_need_update`` says so.
        '''
        if time.time() - self.last_check_projects > self.CHECK_PROJECTS_INTERVAL:
            self._check_projects()
        if self._need_update(project_name, updatetime, md5sum):
            self._update_project(project_name)
        return self.projects.get(project_name, None)
155
156
157
class ProjectFinder(object):
    '''ProjectFinder class for sys.meta_path

    Answers for the ``projects`` package itself and for its direct
    children ``projects.<name>``, which are looked up in projectdb.
    '''

    def __init__(self, projectdb):
        # Hold projectdb weakly so this finder does not keep it alive.
        self.get_projectdb = weakref.ref(projectdb)

    @property
    def projectdb(self):
        '''The referenced projectdb, or None once it has been collected.'''
        return self.get_projectdb()

    def find_module(self, fullname, path=None):
        '''Return a loader for ``fullname`` or None.

        Returns this finder for ``projects``, a ``ProjectLoader`` for
        ``projects.<name>`` when projectdb has that project, and None for
        every other name.
        '''
        if fullname == 'projects':
            return self
        parts = fullname.split('.')
        if len(parts) == 2 and parts[0] == 'projects':
            name = parts[1]
            # Dereference the weakref once and keep a strong reference for
            # the lookup; compare with `is None` so a live but falsy
            # projectdb object is not mistaken for a dead reference.
            projectdb = self.projectdb
            if projectdb is None:
                return None
            info = projectdb.get(name)
            if info:
                return ProjectLoader(info)
        return None

    def load_module(self, fullname):
        '''Create the empty package module for ``projects``.'''
        # types.ModuleType replaces imp.new_module: the `imp` module is
        # deprecated and no longer exists on Python 3.12+.
        mod = types.ModuleType(fullname)
        mod.__file__ = '<projects>'
        mod.__loader__ = self
        mod.__path__ = ['<projects>']
        mod.__package__ = 'projects'
        return mod

    def is_package(self, fullname):
        return True
189
190
191
class ProjectLoader(object):
    '''ProjectLoader class for sys.meta_path

    Builds a module from one project info dict by compiling and executing
    the project's ``script``.
    '''

    def __init__(self, project, mod=None):
        '''
        :param project: project info dict; ``name`` and ``script`` are read.
        :param mod: optional existing module object to execute the script
            in; a new module is created on first load when omitted.
        '''
        self.project = project
        self.name = project['name']
        self.mod = mod

    def load_module(self, fullname):
        '''Execute the project script in the module and return the module.'''
        if self.mod is None:
            # types.ModuleType replaces imp.new_module: the `imp` module is
            # deprecated and no longer exists on Python 3.12+.
            self.mod = mod = types.ModuleType(fullname)
        else:
            mod = self.mod
        mod.__file__ = '<%s>' % self.name
        mod.__loader__ = self
        mod.__project__ = self.project
        mod.__package__ = ''
        code = self.get_code(fullname)
        six.exec_(code, mod.__dict__)
        # Drop cached source lines after executing the script.
        linecache.clearcache()
        return mod

    def is_package(self, fullname):
        return False

    def get_code(self, fullname):
        '''Compile the project script; the filename is ``<project name>``.'''
        return compile(self.get_source(fullname), '<%s>' % self.name, 'exec')

    def get_source(self, fullname):
        '''Return the project script, utf8-encoded when it is a text string.'''
        script = self.project['script']
        if isinstance(script, six.text_type):
            return script.encode('utf8')
        return script
224