SemanticMediaWiki /
SemanticCite
| 1 | <?php |
||
| 2 | |||
| 3 | namespace SCI\Bibtex; |
||
| 4 | |||
/**
 * Turns the content of a BibTeX author field into a list of author names,
 * each rendered as "<first names> <initials> <prefix> <surname>".
 *
 * @note most of the parsing code has been copied from PARSECREATORS therefore
 * thanks goes to the authors of http://bibliophile.sourceforge.net
 *
 * Comments to the source code can be found at
 * http://sourceforge.net/projects/bibliophile/files/bibtexParse/ released
 * under the GPL license.
 *
 * @license GNU GPL v2+
 * @since 1.0
 */
class BibtexAuthorListParser {

	/**
	 * Create writer arrays from bibtex input
	 *
	 * The author field can be (delimiters between authors are 'and' or '&'):
	 * 1. <first-tokens> <von-tokens> <last-tokens>
	 * 2. <von-tokens> <last-tokens>, <first-tokens>
	 * 3. <von-tokens> <last-tokens>, <jr-tokens>, <first-tokens>
	 *
	 * @since 1.0
	 *
	 * @param string $input
	 *
	 * @return array one formatted name per author, in input order
	 */
	public function parse( $input ) {

		$authorList = [];

		// Authors are separated by a whitespace-delimited 'and' or '&'
		$authorArray = preg_split( "/\s(and|&)\s/i", trim( $input ) );

		foreach ( $authorArray as $value ) {
			$authorList[] = $this->parseAuthor( $value );
		}

		return $authorList;
	}

	/**
	 * Parses a single author (one element of the 'and'/'&' split) into its
	 * formatted name.
	 *
	 * @param string $value
	 *
	 * @return string
	 */
	private function parseAuthor( $value ) {

		$appellation = '';
		$prefix = '';

		// Normalize once and use the normalized string in every branch. The
		// split in parse() consumes only one whitespace character on each side
		// of the delimiter, so a raw value may still carry leading/trailing
		// whitespace; tabs and newlines count as token separators as well.
		$value = preg_replace( '/\s+/', ' ', trim( $value ) );

		$author = explode( ',', $value );
		$size = count( $author );

		// No commas therefore something like Mark Grimshaw, Mark Nicholas Grimshaw, M N Grimshaw, Mark N. Grimshaw
		if ( $size === 1 ) {
			// Is the complete surname enclosed in {...}? Not if the brace
			// content starts with a backslash (\) because then it is probably
			// a special latex-sign; the second pattern additionally rules out
			// NESTED curly braces such as author="a{\"{o}}"
			if ( preg_match( "/(.*){([^\\\].*)}/", $value, $matches ) &&
				!preg_match( "/(.*){\\\.{.*}.*}/", $value ) ) {
				$author = explode( ' ', $matches[1] );
				$surname = $matches[2];
			} else {
				$author = explode( ' ', $value );
				// last of array is surname (no prefix if entered correctly)
				$surname = array_pop( $author );
			}
		} elseif ( $size === 2 ) {
			// Something like Grimshaw, Mark or Grimshaw, Mark Nicholas or Grimshaw, M N or Grimshaw, Mark N.
			// first of array is surname (perhaps with prefix)
			list( $surname, $prefix ) = $this->grabSurname( array_shift( $author ) );
		} else {
			// Three or more parts, something like Bush, Jr. III, George W
			// second element is 'Jr.', 'IV' etc.
			$appellation = implode( ' ', array_splice( $author, 1, 1 ) );
			// first of array is surname (perhaps with prefix)
			list( $surname, $prefix ) = $this->grabSurname( array_shift( $author ) );
		}

		// Whatever is left are first names, initials and (form 1) von-tokens
		list( $firstname, $initials, $lowercaseTokens ) = $this->grabFirstnameInitials(
			implode( ' ', $author )
		);

		// Lowercase tokens found among the remaining tokens replace any prefix
		// that was split off the surname part
		if ( $lowercaseTokens !== [] ) {
			$prefix = implode( ' ', $lowercaseTokens );
		}

		// The appellation is rendered directly after the surname
		$surname = $surname . ' ' . trim( $appellation );

		return $this->concatenate( $firstname, $initials, $surname, $prefix );
	}

	/**
	 * Joins the non-empty name parts with single spaces in the order
	 * firstname, initials, prefix, surname.
	 *
	 * @param string $firstname
	 * @param string $initials
	 * @param string $surname
	 * @param string $prefix
	 *
	 * @return string
	 */
	private function concatenate( $firstname, $initials, $surname, $prefix ) {

		$parts = array_map( 'trim', [ $firstname, $initials, $prefix, $surname ] );

		// Only drop parts that are really empty; array_filter() without a
		// callback would also discard the string "0"
		$parts = array_filter( $parts, function ( $part ) {
			return $part !== '';
		} );

		return implode( ' ', $parts );
	}

	/**
	 * Sorts the space separated tokens of $remainder into first names,
	 * initials and lowercase (prefix) tokens.
	 *
	 * @note firstname and initials which may be of form "A.B.C." or "A. B. C. " or " A B C " etc.
	 *
	 * @param string $remainder
	 *
	 * @return array [ string $firstname, string $initials, string[] $prefixTokens ]
	 */
	private function grabFirstnameInitials( $remainder ) {

		$firstnameArray = [];
		$initialsArray = [];
		$prefixArray = [];

		foreach ( explode( ' ', $remainder ) as $value ) {
			$value = trim( $value );

			if ( $this->startsWithLowercase( $value ) ) {
				// lowercase first character marks a prefix such as 'von', 'de'
				$prefixArray[] = $value;
			} elseif ( preg_match( "/[a-zA-Z]{2,}/", $value ) ) {
				// at least two consecutive letters: treat as a first name
				$firstnameArray[] = $value;
			} else {
				// everything else is an initial; dots become separators
				$initialsArray[] = trim( str_replace( ".", " ", $value ) );
			}
		}

		return [
			implode( ' ', $firstnameArray ),
			implode( ' ', $initialsArray ),
			$prefixArray
		];
	}

	/**
	 * Splits the surname part into the surname and its leading prefix.
	 *
	 * @note surname may have title such as 'den', 'von', 'de la' etc. -
	 * characterised by first character lowercased. Any uppercased part means
	 * lowercased parts following are part of the surname (e.g. Van den Bussche)
	 *
	 * @param string $input
	 *
	 * @return array [ string $surname, string $prefix ], $prefix is '' when absent
	 */
	private function grabSurname( $input ) {

		$surname = [];
		$prefix = [];

		foreach ( explode( ' ', $input ) as $value ) {
			// Only lowercase tokens seen before the first surname token count
			// as prefix
			if ( $surname === [] && $this->startsWithLowercase( $value ) ) {
				$prefix[] = $value;
			} else {
				$surname[] = $value;
			}
		}

		return [ implode( ' ', $surname ), implode( ' ', $prefix ) ];
	}

	/**
	 * Whether the first byte of the token is an ASCII lowercase letter (a-z).
	 *
	 * @param string $token
	 *
	 * @return bool
	 */
	private function startsWithLowercase( $token ) {
		return preg_match( '/^[a-z]/', $token ) === 1;
	}

}
||
| 176 |