|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
namespace SCI\Bibtex; |
|
4
|
|
|
|
|
5
|
|
|
/** |
|
6
|
|
|
* @note most of the parsing code has been copied from PARSECREATORS therefore |
|
7
|
|
|
* thanks goes to the authors of http://bibliophile.sourceforge.net |
|
8
|
|
|
* |
|
9
|
|
|
* Comments to the source code can be found at |
|
10
|
|
|
* http://sourceforge.net/projects/bibliophile/files/bibtexParse/ released under |
|
11
|
|
|
* the GPL license. |
|
12
|
|
|
* |
|
13
|
|
|
* @license GNU GPL v2+ |
|
14
|
|
|
* @since 1.0 |
|
15
|
|
|
*/ |
|
16
|
|
|
class BibtexAuthorListParser {

	/**
	 * Turn a BibTeX author field into a list of display names
	 *
	 * The author field can take one of the following forms (delimiters between
	 * authors are 'and' or '&', surrounded by whitespace):
	 * 1. <first-tokens> <von-tokens> <last-tokens>
	 * 2. <von-tokens> <last-tokens>, <first-tokens>
	 * 3. <von-tokens> <last-tokens>, <jr-tokens>, <first-tokens>
	 *
	 * @since 1.0
	 *
	 * @param string $input
	 *
	 * @return string[] one "<firstname> <initials> <prefix> <surname>" string
	 *  per author, with empty parts left out
	 */
	public function parse( $input ) {

		// The delimiter has to be surrounded by whitespace so that names which
		// merely contain "and" are left alone. All adjacent whitespace is
		// consumed so that no author string keeps a dangling space.
		$authorArray = preg_split( '/\s+(?:and|&)\s+/i', trim( $input ) );

		if ( $authorArray === false ) {
			return [];
		}

		$authorList = [];

		foreach ( $authorArray as $value ) {
			$authorList[] = $this->parseAuthor( $value );
		}

		return $authorList;
	}

	/**
	 * Parse a single author (one element of the 'and'/'&' separated list)
	 *
	 * @param string $value
	 *
	 * @return string
	 */
	private function parseAuthor( $value ) {

		$appellation = '';
		$prefix = '';

		// Normalise once, up front: every whitespace run (tabs and newlines
		// included) becomes a single space, so that all branches below
		// tokenise the very same string.
		$value = trim( preg_replace( '/\s+/', ' ', $value ) );

		$author = explode( ',', $value );
		$size = count( $author );

		if ( $size === 1 ) {
			// No commas therefore something like Mark Grimshaw, Mark Nicholas
			// Grimshaw, M N Grimshaw, Mark N. Grimshaw
			//
			// A surname enclosed in {...} is taken verbatim unless the brace
			// content starts with a backslash (probably a LaTeX special sign).
			// The second pattern additionally rules out nested braces such as
			// author="a{\"{o}}".
			if ( preg_match( '/(.*){([^\\\\].*)}/', $value, $matches ) &&
				!preg_match( '/(.*){\\\\.{.*}.*}/', $value ) ) {
				$author = explode( ' ', $matches[1] );
				$surname = $matches[2];
			} else {
				$author = explode( ' ', $value );
				// last of array is surname (no prefix if entered correctly)
				$surname = array_pop( $author );
			}
		} else {
			if ( $size > 2 ) {
				// Something like Bush, Jr. III, George W where the middle of
				// the array is 'Jr.', 'IV' etc.
				$appellation = implode( ' ', array_splice( $author, 1, 1 ) );
			}

			// Something like Grimshaw, Mark or Grimshaw, Mark N. where the
			// first of the array is the surname (perhaps with prefix)
			list( $surname, $prefix ) = $this->grabSurname( array_shift( $author ) );
		}

		list( $firstname, $initials, $firstnamePrefix ) = $this->grabFirstnameInitials(
			implode( ' ', $author )
		);

		// Lowercase tokens found among the first names (e.g. "Beethoven,
		// Ludwig van") replace any prefix found in front of the surname.
		if ( $firstnamePrefix !== [] ) {
			$prefix = implode( ' ', $firstnamePrefix );
		}

		$surname = $surname . ' ' . trim( $appellation );

		return $this->concatenate( $firstname, $initials, $surname, $prefix );
	}

	/**
	 * Join the name parts in display order, skipping parts that are empty
	 *
	 * @param string $firstname
	 * @param string $initials
	 * @param string $surname
	 * @param string $prefix
	 *
	 * @return string
	 */
	private function concatenate( $firstname, $initials, $surname, $prefix ) {

		$parts = [
			trim( $firstname ),
			trim( $initials ),
			trim( $prefix ),
			trim( $surname )
		];

		// Compare against '' explicitly: array_filter() without a callback
		// would also discard a part that is literally "0".
		$parts = array_filter( $parts, function ( $part ) {
			return $part !== '';
		} );

		return implode( ' ', $parts );
	}

	/**
	 * @note firstname and initials which may be of form "A.B.C." or "A. B. C. " or " A B C " etc.
	 *
	 * @param string $remainder
	 *
	 * @return array [ firstname, initials, list of lowercase-initial tokens ]
	 */
	private function grabFirstnameInitials( $remainder ) {

		$firstnameArray = [];
		$initialsArray = [];
		$prefixArray = [];

		foreach ( explode( ' ', $remainder ) as $value ) {
			$value = trim( $value );

			// Empty tokens stem from the space that follows a comma
			if ( $value === '' ) {
				continue;
			}

			if ( $this->startsWithLowercase( $value ) ) {
				$prefixArray[] = $value;
			} elseif ( preg_match( '/[a-zA-Z]{2,}/', $value ) ) {
				// Two or more consecutive letters: a written-out name
				$firstnameArray[] = $value;
			} else {
				$initialsArray[] = trim( str_replace( '.', ' ', $value ) );
			}
		}

		return [
			implode( ' ', $firstnameArray ),
			implode( ' ', $initialsArray ),
			$prefixArray
		];
	}

	/**
	 * @note surname may have title such as 'den', 'von', 'de la' etc. -
	 * characterised by first character lowercased. Any uppercased part means
	 * lowercased parts following are part of the surname (e.g. Van den Bussche)
	 *
	 * @param string $input
	 *
	 * @return string[] [ surname, prefix ] where prefix is '' when there is none
	 */
	private function grabSurname( $input ) {

		$surname = [];
		$prefix = [];

		foreach ( explode( ' ', $input ) as $value ) {
			// Only leading lowercase tokens qualify as prefix; as soon as one
			// surname token has been collected everything else belongs to it
			if ( $surname === [] && $this->startsWithLowercase( $value ) ) {
				$prefix[] = $value;
			} else {
				$surname[] = $value;
			}
		}

		return [ implode( ' ', $surname ), implode( ' ', $prefix ) ];
	}

	/**
	 * Whether the first byte of the token is an ASCII lowercase letter (a-z)
	 *
	 * @param string $token
	 *
	 * @return boolean
	 */
	private function startsWithLowercase( $token ) {
		return preg_match( '/^[a-z]/', $token ) === 1;
	}
}
|
176
|
|
|
|