1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Wikibase\JsonDumpReader; |
4
|
|
|
|
5
|
|
|
use Deserializers\Deserializer; |
6
|
|
|
use Iterator; |
7
|
|
|
use Wikibase\JsonDumpReader\Iterator\EntityDumpIterator; |
8
|
|
|
use Wikibase\JsonDumpReader\Iterator\ObjectDumpIterator; |
9
|
|
|
use Wikibase\JsonDumpReader\Reader\Bz2DumpReader; |
10
|
|
|
use Wikibase\JsonDumpReader\Reader\ExtractedDumpReader; |
11
|
|
|
use Wikibase\JsonDumpReader\Reader\GzDumpReader; |
12
|
|
|
|
13
|
|
|
/**
 * Package public
 *
 * Factory for the dump readers and dump iterators provided by this package.
 * Readers return the raw JSON lines of a dump; the iterators are layered on
 * top of a reader and expose the lines as strings, as decoded JSON
 * structures, or as deserialized entities.
 *
 * @since 1.0.0
 *
 * @licence GNU GPL v2+
 * @author Jeroen De Dauw < [email protected] >
 */
class JsonDumpFactory {

	/**
	 * Creates a DumpReader that can read lines from a gz compressed JSON dump.
	 * @since 1.1.0
	 *
	 * @param string $dumpFilePath
	 * @param int $initialPosition since 1.3.0
	 *
	 * @return SeekableDumpReader
	 */
	public function newGzDumpReader( $dumpFilePath, $initialPosition = 0 ) {
		return new GzDumpReader( $dumpFilePath, $initialPosition );
	}

	/**
	 * Creates a DumpReader that can read lines from an uncompressed JSON dump.
	 * @since 1.0.0
	 *
	 * @param string $dumpFilePath
	 * @param int $initialPosition
	 *
	 * @return SeekableDumpReader
	 */
	public function newExtractedDumpReader( $dumpFilePath, $initialPosition = 0 ) {
		return new ExtractedDumpReader( $dumpFilePath, $initialPosition );
	}

	/**
	 * Creates a DumpReader that can read lines from a bz2 compressed JSON dump.
	 * Unlike the gz and extracted readers, no initial position can be given.
	 * @since 1.0.0
	 *
	 * @param string $dumpFilePath
	 *
	 * @return DumpReader
	 */
	public function newBz2DumpReader( $dumpFilePath ) {
		return new Bz2DumpReader( $dumpFilePath );
	}

	/**
	 * Creates an Iterator over each JSON serialized Entity in the dump.
	 *
	 * Lines are pulled lazily from the reader. When reading a line fails with
	 * a DumpReadingException, the error is reported to $onError (if provided)
	 * and iteration continues with the next read instead of aborting.
	 * Rewinding the returned iterator rewinds the reader as well.
	 *
	 * @since 1.0.0
	 *
	 * @param DumpReader $dumpReader
	 * @param callable|null $onError Gets called with a single string parameter on error
	 *
	 * @return Iterator string[]
	 */
	public function newStringDumpIterator( DumpReader $dumpReader, callable $onError = null ) {
		$iterator = new \RewindableGenerator( function() use ( $dumpReader, $onError ) {
			while ( true ) {
				try {
					$line = $dumpReader->nextJsonLine();

					// A null result from the reader ends the iteration.
					if ( $line === null ) {
						return;
					}

					yield $line;
				}
				catch ( DumpReadingException $ex ) {
					// Best effort: report the failure and keep looping so that
					// one unreadable line does not stop the whole iteration.
					if ( $onError !== null ) {
						call_user_func( $onError, $ex->getMessage() );
					}
				}
			}
		} );

		// Keep the reader in sync with the iterator: restarting the iteration
		// has to start reading from the reader's beginning again.
		$iterator->onRewind( function() use ( $dumpReader ) {
			$dumpReader->rewind();
		} );

		return $iterator;
	}

	/**
	 * Creates an Iterator over each Entity in the dump as PHP array/object in the JSON format.
	 * This is essentially a json_decode map of the string dump iterator.
	 *
	 * The same $onError handler is registered on the underlying string
	 * iterator and on the returned object iterator.
	 *
	 * @since 1.0.0
	 *
	 * @param DumpReader $dumpReader
	 * @param callable|null $onError Gets called with a single string parameter on error
	 *
	 * @return Iterator array[]
	 */
	public function newObjectDumpIterator( DumpReader $dumpReader, callable $onError = null ) {
		$iterator = new ObjectDumpIterator(
			$this->newStringDumpIterator( $dumpReader, $onError )
		);

		$iterator->onError( $onError );

		return $iterator;
	}

	/**
	 * Creates an Iterator over each Entity in the dump, fully deserialized as EntityDocument.
	 *
	 * Built on top of the object dump iterator; the same $onError handler is
	 * registered on the underlying iterators and on the returned entity iterator.
	 *
	 * @since 1.0.0
	 *
	 * @param DumpReader $dumpReader
	 * @param Deserializer $entityDeserializer
	 * @param callable|null $onError Gets called with a single string parameter on error
	 *
	 * @return Iterator EntityDocument[]
	 */
	public function newEntityDumpIterator( DumpReader $dumpReader, Deserializer $entityDeserializer, callable $onError = null ) {
		$iterator = new EntityDumpIterator(
			$this->newObjectDumpIterator( $dumpReader, $onError ),
			$entityDeserializer
		);

		$iterator->onError( $onError );

		return $iterator;
	}

}