1 | <?php |
||
2 | |||
3 | namespace SCI\Bibtex; |
||
4 | |||
5 | /** |
||
6 | * @note most of the parsing code has been copied from PARSEENTRIES therefore |
||
7 | * thanks goes to the authors of http://bibliophile.sourceforge.net |
||
8 | * |
||
9 | * Comments to the source code can be found at |
||
10 | * http://sourceforge.net/projects/bibliophile/files/bibtexParse/ and is |
||
11 | * released under the GPL license. |
||
12 | * |
||
13 | * @note There might be a better parser out there but I didn't want to spend too |
||
14 | * much time reviewing code therefore PARSEENTRIES does the job well. |
||
15 | * |
||
16 | * Any fancy macro stuff or other complicated string parsing isn't supported |
||
17 | * given that the bibtex format misses a proper specification. PARSEENTRIES |
||
18 | * surely allows to cover more edge cases but for what we want to achieve (to ease |
||
19 | * copy and paste of existing bibtex records) the current implementation is |
||
20 | * sufficient. |
||
21 | * |
||
22 | * BibtexParserTest provides the test interface to verify edge cases. |
||
23 | * |
||
24 | * @license GNU GPL v2+ |
||
25 | * @since 1.0 |
||
26 | */ |
||
class BibtexParser {

	/**
	 * Parses a single BibTeX record into a flat array.
	 *
	 * The result always starts with the keys `type` (lower-cased entry type)
	 * and `reference` (the citation key, empty when the record has none),
	 * followed by one element per field, keyed by the lower-cased field name.
	 * Because the head is merged with the array union operator, a field that
	 * is itself named `type` or `reference` does not override the head value.
	 *
	 * @since 1.0
	 *
	 * @param string $bibtex
	 *
	 * @return array empty when the text does not look like a BibTeX record
	 */
	public function parse( $bibtex ) {

		$matches = $this->findBibtexFormatMatches( $bibtex );

		if ( $matches === [] ) {
			return [];
		}

		$head = [
			'type'      => strtolower( trim( $matches[1] ) ),
			'reference' => $matches[2]
		];

		return $head + $this->parseFields( $matches[3] );
	}

	/**
	 * Splits a record into its leading text, entry type, citation key and
	 * field list.
	 *
	 * @param string $bibtex
	 *
	 * @return array [ leading text, type, reference, fields ] or an empty
	 * array when the record head cannot be recognized
	 */
	private function findBibtexFormatMatches( $bibtex ) {

		$matches = preg_split( "/@(.*)[{(](.*),/U", $bibtex, 2, PREG_SPLIT_DELIM_CAPTURE );

		// Silently retreat from processing
		if ( !is_array( $matches ) || !isset( $matches[3] ) ) {
			return [];
		}

		// An assignment inside the supposed citation key means the record has
		// no key at all and the first field was captured instead, so split
		// again without expecting a key
		if ( strpos( $matches[2], '=' ) !== false ) {
			$matches = preg_split( "/@(.*)\s*[{(](.*)/U", $bibtex, 2, PREG_SPLIT_DELIM_CAPTURE );

			if ( !is_array( $matches ) || !isset( $matches[3] ) ) {
				return [];
			}
		}

		return $matches;
	}

	/**
	 * Turns the field list of a record (everything after the citation key)
	 * into a `name => value` array.
	 *
	 * Fields with an empty value are skipped; a value of `0` is kept.
	 *
	 * @param string $content
	 *
	 * @return array
	 */
	private function parseFields( $content ) {
		$elements = [];

		// Whitespace (e.g. a trailing newline) after the closing delimiter
		// would otherwise hide that delimiter from the check below and leave
		// it attached to the last value
		$content = rtrim( $content );

		// Drop the delimiter that closes the record (or a dangling comma)
		if ( $content !== '' && in_array( substr( $content, -1 ), [ '}', ')', ',' ], true ) ) {
			$content = (string)substr( $content, 0, -1 );
		}

		$split = explode( '=', $content, 2 );

		// Without an assignment there is no field to extract
		if ( !isset( $split[1] ) ) {
			return $elements;
		}

		list( $key, $remainder ) = $split;

		// Explicit comparisons instead of truthiness, otherwise a remainder or
		// value of "0" would be treated as empty and the field would be lost
		while ( $remainder !== false && $remainder !== '' ) {
			list( $value, $nextKey, $remainder ) = $this->splitField( $remainder );

			// Remove any dangling ',' left on the final field of the entry
			$value = trim( rtrim( trim( $value ), ',' ) );

			if ( $value !== '' ) {
				$elements[strtolower( trim( $key ) )] = $this->removeDelimiters( $value );
			}

			$key = $nextKey;
		}

		return $elements;
	}

	/**
	 * Cuts the first value off a segment that starts with a field value.
	 *
	 * The segment is split at the first `, name =` boundary. The limit of 2
	 * is passed explicitly together with the capture flag so that the name
	 * of the following field is returned as well.
	 *
	 * @param string $seg
	 *
	 * @return array [ value, name of the next field or null, remaining text or false ]
	 */
	private function splitField( $seg ) {

		$parts = preg_split(
			"/,\s*([-_.:,a-zA-Z0-9]+)\s*={1}\s*/U",
			$seg,
			2,
			PREG_SPLIT_DELIM_CAPTURE
		);

		// No further field boundary (or the split failed): the whole segment
		// is the last value
		if ( !is_array( $parts ) || !isset( $parts[2] ) ) {
			return [ is_array( $parts ) ? $parts[0] : $seg, null, false ];
		}

		return [ $parts[0], $parts[1], $parts[2] ];
	}

	/**
	 * Removes one pair of enclosing `"..."` or `{...}` delimiters.
	 *
	 * The value is returned unchanged when it is not wrapped in a matching
	 * pair, which includes bare numbers and unterminated values.
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	private function removeDelimiters( $string ) {

		$length = strlen( $string );

		// A pair of delimiters needs at least two characters
		if ( $length < 2 ) {
			return $string;
		}

		$first = $string[0];
		$last = $string[$length - 1];

		if ( ( $first === '"' && $last === '"' ) || ( $first === '{' && $last === '}' ) ) {
			return (string)substr( $string, 1, -1 );
		}

		return $string;
	}
}
||
158 |