1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace SCI\Bibtex; |
4
|
|
|
|
5
|
|
|
/**
 * Turns the content of a BibTeX author field into a list of display names.
 *
 * @note most of the parsing code has been copied from PARSECREATORS therefore
 * thanks goes to the authors of http://bibliophile.sourceforge.net
 *
 * Comments to the source code can be found at
 * http://sourceforge.net/projects/bibliophile/files/bibtexParse/ released
 * under the GPL license.
 *
 * @license GNU GPL v2+
 * @since 1.0
 */
class BibtexAuthorListParser {

	/**
	 * Lowercase-initial tokens found among the first-name tokens of the author
	 * currently being parsed. Reset at the start of every author.
	 *
	 * @var array
	 */
	private $prefix = [];

	/**
	 * Create writer arrays from bibtex input
	 *
	 * The author field can be (delimiters between authors are 'and' or '&'):
	 * 1. <first-tokens> <von-tokens> <last-tokens>
	 * 2. <von-tokens> <last-tokens>, <first-tokens>
	 * 3. <von-tokens> <last-tokens>, <jr-tokens>, <first-tokens>
	 *
	 * Each author is returned as one string assembled in the order
	 * "firstname initials prefix surname appellation", e.g.
	 * "Bush, Jr. III, George W" becomes "George W Bush Jr. III".
	 *
	 * @since 1.0
	 *
	 * @param string $input
	 *
	 * @return array list of strings, one entry per author
	 */
	public function parse( $input ) {

		$authorList = [];

		// Authors are separated by a whitespace-surrounded 'and' or '&'
		$authorArray = preg_split( "/\s(and|&)\s/i", trim( $input ) );

		foreach ( $authorArray as $value ) {
			$appellation = '';
			$prefix = '';

			$this->prefix = [];

			// Reduce every whitespace run (including tabs and line breaks of a
			// wrapped field) to one space. The same normalized string feeds
			// both the comma split and the token split below, so stray
			// whitespace can never produce an empty surname token.
			$value = trim( preg_replace( "/\s+/", ' ', $value ) );

			$author = explode( ",", $value );
			$size = count( $author );

			// No commas therefore something like Mark Grimshaw, Mark Nicholas Grimshaw, M N Grimshaw, Mark N. Grimshaw
			if ( $size === 1 ) {
				// Is the complete surname enclosed in {...}? A group starting
				// with a backslash is not accepted because it is probably a
				// special latex-sign. The second pattern additionally rules
				// out nested braces such as author="a{\"{o}}".
				if ( preg_match( "/(.*){([^\\\].*)}/", $value, $matches ) &&
					!preg_match( "/(.*){\\\.{.*}.*}/", $value ) ) {
					$author = explode( " ", $matches[1] );
					$surname = $matches[2];
				} else {
					$author = explode( " ", $value );
					// last of array is surname (no prefix if entered correctly)
					$surname = array_pop( $author );
				}
			} elseif ( $size === 2 ) {
				// Something like Grimshaw, Mark or Grimshaw, Mark Nicholas or Grimshaw, M N or Grimshaw, Mark N.
				// first of array is surname (perhaps with prefix)
				list( $surname, $prefix ) = $this->grabSurname( array_shift( $author ) );
			} else {
				// Three (or more) parts, something like Bush, Jr. III, George W
				// second part is 'Jr.', 'IV' etc.
				$appellation = implode( ' ', array_splice( $author, 1, 1 ) );
				// first of array is surname (perhaps with prefix)
				list( $surname, $prefix ) = $this->grabSurname( array_shift( $author ) );
			}

			// Whatever is left are first names, initials and lowercase particles
			$remainder = implode( " ", $author );

			list( $firstname, $initials ) = $this->grabFirstnameInitials( $remainder );

			// Particles found among the first-name tokens take precedence over
			// a prefix that was split off the surname part
			if ( $this->prefix !== [] ) {
				$prefix = implode( ' ', $this->prefix );
			}

			$surname = $surname . ' ' . trim( $appellation );

			$authorList[] = $this->concatenate( $firstname, $initials, $surname, $prefix );
		}

		return $authorList;
	}

	/**
	 * Joins the name parts with single spaces in the order firstname,
	 * initials, prefix, surname.
	 *
	 * Parts are trimmed first; parts that are empty afterwards are left out.
	 * array_filter() is used without a callback, hence a part consisting of
	 * the literal string "0" is left out as well.
	 *
	 * @param string $firstname
	 * @param string $initials
	 * @param string $surname
	 * @param string $prefix
	 *
	 * @return string
	 */
	private function concatenate( $firstname, $initials, $surname, $prefix ) {

		$parts = [
			trim( $firstname ),
			trim( $initials ),
			trim( $prefix ),
			trim( $surname )
		];

		return implode( ' ', array_filter( $parts ) );
	}

	/**
	 * Sorts the space separated tokens of $remainder into first names,
	 * initials and lowercase particles.
	 *
	 * Tokens starting with an ASCII lowercase letter are appended to
	 * $this->prefix. Tokens containing at least two consecutive ASCII letters
	 * count as first names. Everything else is treated as initials, with dots
	 * replaced by spaces.
	 *
	 * @note firstname and initials which may be of form "A.B.C." or "A. B. C. " or " A B C " etc.
	 *
	 * @param string $remainder
	 *
	 * @return array [ firstname, initials ]
	 */
	private function grabFirstnameInitials( $remainder ) {

		$initialsArray = [];
		$firstnameArray = [];

		foreach ( explode( " ", $remainder ) as $token ) {
			$token = trim( $token );

			// Empty tokens stem from the space following a comma and carry no
			// information
			if ( $token === '' ) {
				continue;
			}

			if ( $this->startsWithLowercase( $token ) ) {
				$this->prefix[] = $token;
			} elseif ( preg_match( "/[a-zA-Z]{2,}/", $token ) ) {
				$firstnameArray[] = $token;
			} else {
				$initialsArray[] = trim( str_replace( ".", " ", $token ) );
			}
		}

		return [ implode( " ", $firstnameArray ), implode( " ", $initialsArray ) ];
	}

	/**
	 * Splits the surname part into the surname itself and its leading
	 * particles.
	 *
	 * @note surname may have title such as 'den', 'von', 'de la' etc. -
	 * characterised by first character lowercased. Any uppercased part means
	 * lowercased parts following are part of the surname (e.g. Van den Bussche)
	 *
	 * @param string $input
	 *
	 * @return array [ surname, prefix ], prefix is '' when there is none
	 */
	private function grabSurname( $input ) {

		$surname = [];
		$prefix = [];

		// Becomes true with the first token that is not a particle, from then
		// on every token belongs to the surname
		$surnameStarted = false;

		// Trimming avoids an empty trailing token for input like "Bush , Jr."
		foreach ( explode( " ", trim( $input ) ) as $token ) {
			if ( !$surnameStarted && $this->startsWithLowercase( $token ) ) {
				$prefix[] = $token;
			} else {
				$surname[] = $token;
				$surnameStarted = true;
			}
		}

		return [ implode( " ", $surname ), implode( " ", $prefix ) ];
	}

	/**
	 * Whether the first byte of $token is an ASCII lowercase letter (a-z).
	 *
	 * @param string $token
	 *
	 * @return boolean
	 */
	private function startsWithLowercase( $token ) {
		return preg_match( '/^[a-z]/', $token ) === 1;
	}

}
176
|
|
|
|