1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare( strict_types = 1 ); |
4
|
|
|
|
5
|
|
|
namespace Wikibase\JsonDumpReader; |
6
|
|
|
|
7
|
|
|
use Deserializers\Deserializer; |
8
|
|
|
use Iterator; |
9
|
|
|
use Wikibase\JsonDumpReader\Iterator\EntityDumpIterator; |
10
|
|
|
use Wikibase\JsonDumpReader\Iterator\ObjectDumpIterator; |
11
|
|
|
use Wikibase\JsonDumpReader\Reader\Bz2DumpReader; |
12
|
|
|
use Wikibase\JsonDumpReader\Reader\ExtractedDumpReader; |
13
|
|
|
use Wikibase\JsonDumpReader\Reader\GzDumpReader; |
14
|
|
|
|
15
|
|
|
/**
 * Factory for the dump readers and dump iterators provided by this package.
 *
 * Package public
 * @since 1.0.0
 *
 * @licence GNU GPL v2+
 * @author Jeroen De Dauw < [email protected] >
 */
class JsonDumpFactory {

	/**
	 * Creates a DumpReader that can read lines from a gz compressed JSON dump.
	 * @since 1.1.0
	 *
	 * @param string $dumpFilePath
	 * @param int $initialPosition since 1.3.0
	 *
	 * @return SeekableDumpReader
	 */
	public function newGzDumpReader( string $dumpFilePath, int $initialPosition = 0 ): SeekableDumpReader {
		return new GzDumpReader( $dumpFilePath, $initialPosition );
	}

	/**
	 * Creates a DumpReader that can read lines from an uncompressed JSON dump.
	 * @since 1.0.0
	 *
	 * @param string $dumpFilePath
	 * @param int $initialPosition
	 *
	 * @return SeekableDumpReader
	 */
	public function newExtractedDumpReader( string $dumpFilePath, int $initialPosition = 0 ): SeekableDumpReader {
		return new ExtractedDumpReader( $dumpFilePath, $initialPosition );
	}

	/**
	 * Creates a DumpReader that can read lines from a bz2 compressed JSON dump.
	 * @since 1.0.0
	 *
	 * @param string $dumpFilePath
	 *
	 * @return DumpReader
	 */
	public function newBz2DumpReader( string $dumpFilePath ): DumpReader {
		return new Bz2DumpReader( $dumpFilePath );
	}

	/**
	 * Creates an Iterator over each JSON serialized Entity in the dump.
	 *
	 * Lines are pulled lazily from the reader via nextJsonLine(). A
	 * DumpReadingException raised while reading is reported to $onError
	 * (when provided) and reading then continues with the next call.
	 * Rewinding the returned iterator rewinds the reader.
	 *
	 * @since 1.0.0
	 *
	 * @param DumpReader $dumpReader
	 * @param callable|null $onError Gets called with a single string parameter on error
	 *
	 * @return Iterator Yields each JSON line as string
	 */
	public function newStringDumpIterator( DumpReader $dumpReader, callable $onError = null ): Iterator {
		$iterator = new \RewindableGenerator( function() use ( $dumpReader, $onError ) {
			while ( true ) {
				try {
					$line = $dumpReader->nextJsonLine();

					// A null line ends the iteration.
					if ( $line === null ) {
						return;
					}

					yield $line;
				}
				catch ( DumpReadingException $ex ) {
					// Best effort: report the failure and keep looping rather than aborting.
					// NOTE(review): this relies on the reader making progress after a
					// failed read; otherwise the loop would not terminate - confirm.
					if ( $onError !== null ) {
						$onError( $ex->getMessage() );
					}
				}
			}
		} );

		// Keep the reader in sync with the iterator when it gets rewound.
		$iterator->onRewind( function() use ( $dumpReader ) {
			$dumpReader->rewind();
		} );

		return $iterator;
	}

	/**
	 * Creates an Iterator over each Entity in the dump as PHP array/object in the JSON format.
	 * This is essentially a json_decode map of the string dump iterator.
	 *
	 * The same $onError callback is handed to both the underlying string
	 * iterator and the returned ObjectDumpIterator.
	 *
	 * @since 1.0.0
	 *
	 * @param DumpReader $dumpReader
	 * @param callable|null $onError Gets called with a single string parameter on error
	 *
	 * @return Iterator Yields one decoded entity (array) per dump line
	 */
	public function newObjectDumpIterator( DumpReader $dumpReader, callable $onError = null ): Iterator {
		$iterator = new ObjectDumpIterator(
			$this->newStringDumpIterator( $dumpReader, $onError )
		);

		$iterator->onError( $onError );

		return $iterator;
	}

	/**
	 * Creates an Iterator over each Entity in the dump, fully deserialized as EntityDocument.
	 *
	 * Built on top of the object dump iterator; the same $onError callback
	 * is handed to every layer of the iterator chain.
	 *
	 * @since 1.0.0
	 *
	 * @param DumpReader $dumpReader
	 * @param Deserializer $entityDeserializer
	 * @param callable|null $onError Gets called with a single string parameter on error
	 *
	 * @return Iterator Yields one EntityDocument per dump line
	 */
	public function newEntityDumpIterator( DumpReader $dumpReader, Deserializer $entityDeserializer,
		callable $onError = null ): Iterator {

		$iterator = new EntityDumpIterator(
			$this->newObjectDumpIterator( $dumpReader, $onError ),
			$entityDeserializer
		);

		$iterator->onError( $onError );

		return $iterator;
	}

}