SemanticMediaWiki /
SemanticCite
| 1 | <?php |
||
| 2 | |||
| 3 | namespace SCI\Bibtex; |
||
| 4 | |||
| 5 | /** |
||
| 6 | * @note most of the parsing code has been copied from PARSEENTRIES therefore |
||
| 7 | * thanks goes to the authors of http://bibliophile.sourceforge.net |
||
| 8 | * |
||
| 9 | * Comments to the source code can be found at |
||
| 10 | * http://sourceforge.net/projects/bibliophile/files/bibtexParse/ and is |
||
| 11 | * released under the GPL license. |
||
| 12 | * |
||
| 13 | * @note There might be a better parser out there but I didn't want to spend too |
||
| 14 | * much time reviewing code therefore PARSEENTRIES does the job well. |
||
| 15 | * |
||
| 16 | * Any fancy macro stuff or other complicated string parsing isn't supported |
||
| 17 | * given that the bibtex format misses a proper specification. PARSEENTRIES |
||
| 18 | * surely covers more edge cases but for what we want to achieve (to ease |
||
| 19 | * copy and paste of existing bibtex records) the current implementation is |
||
| 20 | * sufficient. |
||
| 21 | * |
||
| 22 | * BibtexParserTest provides the test interface to verify edge cases. |
||
| 23 | * |
||
| 24 | * @license GNU GPL v2+ |
||
| 25 | * @since 1.0 |
||
| 26 | */ |
||
class BibtexParser {

	/**
	 * Parses a single BibTeX record into a flat array.
	 *
	 * The returned array always starts with the keys `type` (lower-cased entry
	 * type, e.g. `article`) and `reference` (the citation key, possibly an
	 * empty string), followed by one element per field with the lower-cased
	 * field name as key and the undelimited field value as value.
	 *
	 * @since 1.0
	 *
	 * @param string $bibtex
	 *
	 * @return array empty when the text does not look like a BibTeX record
	 */
	public function parse( $bibtex ) {

		$matches = $this->findBibtexFormatMatches( $bibtex );

		if ( $matches === [] ) {
			return [];
		}

		$head = [
			'type' => strtolower( trim( $matches[1] ) ),
			'reference' => $matches[2]
		];

		// Array union: `type` and `reference` from the record header win over
		// equally named fields found in the body
		return $head + $this->parseFields( $matches[3] );
	}

	/**
	 * Splits a record into its header and its body.
	 *
	 * @param string $bibtex
	 *
	 * @return array [ text before the record, entry type, citation key, body ]
	 *  or an empty array when no record header was found
	 */
	private function findBibtexFormatMatches( $bibtex ) {

		if ( $bibtex === null || $bibtex === '' ) {
			return [];
		}

		$matches = preg_split( '/@(.*)[{(](.*),/U', $bibtex, 2, PREG_SPLIT_DELIM_CAPTURE );

		// Silently retreat from processing
		if ( !is_array( $matches ) || !isset( $matches[3] ) ) {
			return [];
		}

		// An `=` in the citation key position means the record has no citation
		// key and the first field was captured instead, therefore split again
		// right after the opening delimiter
		if ( strpos( $matches[2], '=' ) !== false ) {
			$matches = preg_split( '/@(.*)\s*[{(](.*)/U', $bibtex, 2, PREG_SPLIT_DELIM_CAPTURE );

			if ( !is_array( $matches ) || !isset( $matches[3] ) ) {
				return [];
			}
		}

		return $matches;
	}

	/**
	 * Turns the body of a record (`key = value, key = value ... }`) into a
	 * `key => value` array.
	 *
	 * Fields without a value are skipped; a value of `0` is kept.
	 *
	 * @param string $content
	 *
	 * @return array
	 */
	private function parseFields( $content ) {

		$elements = [];

		// Whitespace (e.g. a newline) following the closing delimiter would
		// otherwise hide that delimiter from the check below
		$content = rtrim( (string)$content );

		if ( $content === '' ) {
			return $elements;
		}

		$last = substr( $content, -1 );

		// Drop the delimiter that closes the record (or a dangling comma)
		if ( $last === '}' || $last === ')' || $last === ',' ) {
			$content = substr( $content, 0, -1 );
		}

		$split = explode( '=', $content, 2 );

		// No `=` means there is not a single field to be found
		if ( !isset( $split[1] ) ) {
			return $elements;
		}

		list( $key, $remainder ) = $split;

		// Each round consumes the value that belongs to the current key and
		// picks up the name of the field that follows it
		while ( $remainder !== '' ) {
			list( $value, $nextKey, $remainder ) = $this->splitField( $remainder );

			// Remove any dangling ',' left on the final field of an entry
			$value = trim( rtrim( trim( $value ), ',' ) );

			// Strict comparison so that a value such as `0` is not discarded
			if ( $value !== '' ) {
				$elements[strtolower( trim( $key ) )] = $this->removeDelimiters( $value );
			}

			$key = $nextKey;
		}

		return $elements;
	}

	/**
	 * Cuts the leading field value off a segment.
	 *
	 * The segment is split at the first `, name =` separator, which means a
	 * comma inside a value is only treated as separator when it is followed
	 * by something that looks like a field name and an equals sign.
	 *
	 * @param string $seg
	 *
	 * @return array [ value, name of the next field, remaining text ]; the
	 *  last two elements are empty strings when no further field follows
	 */
	private function splitField( $seg ) {

		// A limit of 2 splits off only the first value; DELIM_CAPTURE returns
		// the field name matched by the separator as the second element
		$parts = preg_split(
			'/,\s*([-_.:,a-zA-Z0-9]+)\s*={1}\s*/U',
			$seg,
			2,
			PREG_SPLIT_DELIM_CAPTURE
		);

		if ( !is_array( $parts ) || !isset( $parts[2] ) ) {
			return [ $seg, '', '' ];
		}

		return [ $parts[0], $parts[1], $parts[2] ];
	}

	/**
	 * Removes one pair of enclosing `"..."` or `{...}` delimiters.
	 *
	 * The value is returned untouched when it is not enclosed by a matching
	 * pair, so no character of an unterminated or concatenated value is lost.
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	private function removeDelimiters( $string ) {

		$string = (string)$string;

		if ( strlen( $string ) < 2 ) {
			return $string;
		}

		$first = $string[0];
		$last = substr( $string, -1 );

		if ( ( $first === '"' && $last === '"' ) || ( $first === '{' && $last === '}' ) ) {
			return (string)substr( $string, 1, -1 );
		}

		return $string;
	}
}
||
| 158 |