1 | # Copyright (c) 2014, Salesforce.com, Inc. All rights reserved. |
||
2 | # |
||
3 | # Redistribution and use in source and binary forms, with or without |
||
4 | # modification, are permitted provided that the following conditions |
||
5 | # are met: |
||
6 | # |
||
7 | # - Redistributions of source code must retain the above copyright |
||
8 | # notice, this list of conditions and the following disclaimer. |
||
9 | # - Redistributions in binary form must reproduce the above copyright |
||
10 | # notice, this list of conditions and the following disclaimer in the |
||
11 | # documentation and/or other materials provided with the distribution. |
||
12 | # - Neither the name of Salesforce.com nor the names of its contributors |
||
13 | # may be used to endorse or promote products derived from this |
||
14 | # software without specific prior written permission. |
||
15 | # |
||
16 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||
17 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||
18 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
||
19 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
||
20 | # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
||
21 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
||
22 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
||
23 | # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
||
24 | # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR |
||
25 | # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
||
26 | # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||
27 | |||
28 | import os |
||
29 | import bz2 |
||
30 | import gzip |
||
31 | import simplejson |
||
32 | import struct |
||
33 | |||
34 | |||
def mkdir_p(dirname):
    '''like mkdir -p: create dirname and any missing parent directories.

    Safe against concurrent creation: if makedirs fails but the
    directory exists afterwards (another process created it first),
    the error is swallowed.
    '''
    if not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError:
            # Re-raise only if the directory still does not exist.
            # Bare raise (not `raise e`) preserves the original traceback.
            if not os.path.exists(dirname):
                raise
||
43 | |||
44 | |||
def open_compressed(filename, mode='r'):
    '''Open filename, transparently (de)compressing based on extension.

    - '.bz2' -> bz2.BZ2File ('b' is stripped from mode; BZ2File is
      always binary)
    - '.gz'  -> gzip.GzipFile
    - anything else -> a plain file object

    When opening for writing, any missing parent directories of
    filename are created first.
    '''
    if 'w' in mode:
        dirname = os.path.dirname(filename)
        if dirname:
            mkdir_p(dirname)
    if filename.endswith('.bz2'):
        return bz2.BZ2File(filename, mode.replace('b', ''))
    elif filename.endswith('.gz'):
        return gzip.GzipFile(filename, mode)
    else:
        # open() instead of the Python-2-only file() builtin.
        return open(filename, mode)
||
56 | |||
57 | |||
def json_dump(data, filename, **kwargs):
    '''Serialize data as json to filename (compressed per extension).'''
    f = open_compressed(filename, 'w')
    try:
        simplejson.dump(data, f, **kwargs)
    finally:
        f.close()
||
61 | |||
62 | |||
def json_load(filename):
    '''Load and return json data from filename (compressed per extension).'''
    f = open_compressed(filename, 'rb')
    try:
        return simplejson.load(f)
    finally:
        f.close()
||
66 | |||
67 | |||
def json_stream_dump(stream, filename, **kwargs):
    '''Dump an iterable of json-serializable items to filename as a
    json list, one item per line, in the newline format expected by
    json_stream_load. An empty stream produces '[\\n]'.
    '''
    kwargs['separators'] = (',', ':')
    with open_compressed(filename, 'w') as f:
        f.write('[')
        first = True
        for item in iter(stream):
            # First item follows the opening '[' on its own line;
            # later items are comma-separated, one per line.
            f.write('\n' if first else ',\n')
            first = False
            simplejson.dump(item, f, **kwargs)
        f.write('\n]')
||
83 | |||
84 | |||
def json_costream_dump(filename, **kwargs):
    '''Coroutine version of json_stream_dump: items are pushed in via
    send(); closing the coroutine finalizes the json list. Output
    format matches json_stream_dump exactly.
    '''
    kwargs['separators'] = (',', ':')
    with open_compressed(filename, 'w') as f:
        f.write('[')
        prefix = '\n'
        try:
            while True:
                item = (yield)
                f.write(prefix)
                prefix = ',\n'
                simplejson.dump(item, f, **kwargs)
        except GeneratorExit:
            # close() before any send() still yields a valid '[\n]'.
            pass
        f.write('\n]')
||
100 | |||
101 | |||
class json_stream_load(object):
    '''
    Read json data that was created by json_stream_dump or json_costream_dump.

    Note that this exploits newline formatting in the above dumpers.
    In particular:
    - the first line is '['
    - intermediate lines are of the form '{},'.format(json_parsable_content)
    - the penultimate line is of the form '{}'.format(json_parsable_content)
    - the last line is ']'
    - there is no trailing whitespace

    An alternative would be to use ijson to streamingly load arbitrary json
    files, however in practice this is ~40x slower.
    '''
    def __init__(self, filename):
        self.fd = open_compressed(filename, 'rb')
        # Read at most 2 chars; the compatible dumpers always begin '[\n'.
        line = self.fd.readline(2)
        if line != '[\n':
            raise IOError(
                'Unhandled format for json_stream_load. '
                'Try recreating json file with the compatible '
                'json_stream_dump or json_costream_dump.')

    def __iter__(self):
        return self

    def next(self):
        # Strip the trailing ',' and newline so the line is bare json.
        line = self.fd.readline().rstrip(',\n')
        if line == ']':
            self.close()
            raise StopIteration
        else:
            return simplejson.loads(line)

    # Python 3 iterator protocol; next() is kept for Python 2 callers.
    __next__ = next

    def close(self):
        self.fd.close()
||
139 | |||
140 | |||
def protobuf_stream_write(item, fd):
    '''Write one serialized message to fd: a little-endian uint32
    length prefix followed by the raw payload.
    '''
    # bytes rather than str, so binary payloads work on Python 3 too
    # (on Python 2, bytes is an alias of str, so behavior is unchanged).
    assert isinstance(item, bytes), item
    fd.write(struct.pack('<I', len(item)))
    fd.write(item)
||
145 | |||
146 | |||
def protobuf_stream_read(fd):
    '''Read one length-prefixed message from fd and return its raw
    payload; raise StopIteration when fd is exhausted (short read on
    the 4-byte length header).
    '''
    header = fd.read(4)
    if len(header) != 4:
        raise StopIteration
    (length,) = struct.unpack('<I', header)
    return fd.read(length)
||
153 | |||
154 | |||
def protobuf_stream_dump(stream, filename):
    '''Write every message in stream to filename as length-prefixed
    records, compatible with protobuf_stream_load.
    '''
    with open_compressed(filename, 'wb') as out:
        for message in stream:
            protobuf_stream_write(message, out)
||
159 | |||
160 | |||
class protobuf_stream_load(object):
    '''Iterate over the length-prefixed messages in a file written by
    protobuf_stream_dump, yielding one raw payload at a time.
    '''
    def __init__(self, filename):
        self.fd = open_compressed(filename, 'rb')

    def __iter__(self):
        return self

    def next(self):
        # protobuf_stream_read raises StopIteration at end of file,
        # which terminates iteration.
        return protobuf_stream_read(self.fd)

    # Python 3 iterator protocol; next() is kept for Python 2 callers.
    __next__ = next

    def close(self):
        self.fd.close()
||
173 |