Passed
Branch master (b37043)
by Timo
11:18
created

Apache_Solr_Service::extractFromString()   C

Complexity

Conditions 7
Paths 5

Size

Total Lines 42
Code Lines 21

Duplication

Lines 7
Ratio 16.67 %

Importance

Changes 0
Metric Value
cc 7
eloc 21
nc 5
nop 4
dl 7
loc 42
rs 6.7272
c 0
b 0
f 0
1
<?php
2
/**
3
 * Copyright (c) 2007-2012, Servigistics, Inc.
4
 * All rights reserved.
5
 *
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are met:
8
 *
9
 *  - Redistributions of source code must retain the above copyright notice,
10
 *    this list of conditions and the following disclaimer.
11
 *  - Redistributions in binary form must reproduce the above copyright
12
 *    notice, this list of conditions and the following disclaimer in the
13
 *    documentation and/or other materials provided with the distribution.
14
 *  - Neither the name of Servigistics, Inc. nor the names of
15
 *    its contributors may be used to endorse or promote products derived from
16
 *    this software without specific prior written permission.
17
 *
18
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
 * POSSIBILITY OF SUCH DAMAGE.
29
 *
30
 * @copyright Copyright 2007-2012 Servigistics, Inc. (http://servigistics.com)
31
 * @license http://solr-php-client.googlecode.com/svn/trunk/COPYING New BSD
32
 * @version $Id$
33
 *
34
 * @package Apache
35
 * @subpackage Solr
36
 * @author Donovan Jimenez <[email protected]>
37
 */
38
39
// See Issue #1 (http://code.google.com/p/solr-php-client/issues/detail?id=1)
40
// Doesn't follow typical include path conventions, but is more convenient for users
41
require_once(dirname(__FILE__) . '/Exception.php');
42
require_once(dirname(__FILE__) . '/HttpTransportException.php');
43
require_once(dirname(__FILE__) . '/InvalidArgumentException.php');
44
45
require_once(dirname(__FILE__) . '/Document.php');
46
require_once(dirname(__FILE__) . '/Response.php');
47
48
require_once(dirname(__FILE__) . '/HttpTransport/Interface.php');
49
50
/**
51
 * Starting point for the Solr API. Represents a Solr server resource and has
52
 * methods for pinging, adding, deleting, committing, optimizing and searching.
53
 *
54
 * Example Usage:
55
 * <code>
56
 * ...
57
 * $solr = new Apache_Solr_Service(); //or explicitly new Apache_Solr_Service('localhost', 8180, '/solr')
58
 *
59
 * if ($solr->ping())
60
 * {
61
 *        $solr->deleteByQuery('*:*'); //deletes ALL documents - be careful :)
62
 *
63
 *        $document = new Apache_Solr_Document();
64
 *        $document->id = uniqid(); //or something else suitably unique
65
 *
66
 *        $document->title = 'Some Title';
67
 *        $document->content = 'Some content for this wonderful document. Blah blah blah.';
68
 *
69
 *        $solr->addDocument($document);    //if you're going to be adding documents in bulk using addDocuments
70
 *                                        //with an array of documents is faster
71
 *
72
 *        $solr->commit(); //commit to see the deletes and the document
73
 *        $solr->optimize(); //merges multiple segments into one
74
 *
75
 *        //and the one we all care about, search!
76
 *        //any other common or custom parameters to the request handler can go in the
77
 *        //optional 4th array argument.
78
 *        $solr->search('content:blah', 0, 10, array('sort' => 'timestamp desc'));
79
 * }
80
 * ...
81
 * </code>
82
 *
83
 * @todo Investigate using other HTTP clients other than file_get_contents built-in handler. Could provide performance
84
 * improvements when dealing with multiple requests by using HTTP's keep alive functionality
85
 */
86
class Apache_Solr_Service
87
{
88
    /**
89
     * SVN Revision meta data for this class
90
     */
91
    const SVN_REVISION = '$Revision$';
92
93
    /**
94
     * SVN ID meta data for this class
95
     */
96
    const SVN_ID = '$Id$';
97
98
    /**
99
     * Response writer we'll request - JSON. See http://code.google.com/p/solr-php-client/issues/detail?id=6#c1 for reasoning
100
     */
101
    const SOLR_WRITER = 'json';
102
103
    /**
104
     * NamedList Treatment constants
105
     */
106
    const NAMED_LIST_FLAT = 'flat';
107
    const NAMED_LIST_MAP = 'map';
108
109
    /**
110
     * Search HTTP Methods
111
     */
112
    const METHOD_GET = 'GET';
113
    const METHOD_POST = 'POST';
114
115
    /**
116
     * Servlet mappings
117
     */
118
    const PING_SERVLET = 'admin/ping';
119
    const UPDATE_SERVLET = 'update';
120
    const SEARCH_SERVLET = 'select';
121
    const SYSTEM_SERVLET = 'admin/system';
122
    const THREADS_SERVLET = 'admin/threads';
123
    const EXTRACT_SERVLET = 'update/extract';
124
125
    /**
126
     * Server identification strings
127
     *
128
     * @var string
129
     */
130
    protected $_host, $_port, $_path;
131
132
    /**
133
     * Whether {@link Apache_Solr_Response} objects should create {@link Apache_Solr_Document}s in
134
     * the returned parsed data
135
     *
136
     * @var boolean
137
     */
138
    protected $_createDocuments = true;
139
140
    /**
141
     * Whether {@link Apache_Solr_Response} objects should have multivalue fields with only a single value
142
     * collapsed to appear as a single value would.
143
     *
144
     * @var boolean
145
     */
146
    protected $_collapseSingleValueArrays = true;
147
148
    /**
149
     * How NamedLists should be formatted in the output.  This specifically affects facet counts. Valid values
150
     * are {@link Apache_Solr_Service::NAMED_LIST_MAP} (default) or {@link Apache_Solr_Service::NAMED_LIST_FLAT}.
151
     *
152
     * @var string
153
     */
154
    protected $_namedListTreatment = self::NAMED_LIST_MAP;
155
156
    /**
157
     * Query delimiters. Someone might want to be able to change
158
     * these (to use &amp; instead of & for example), so I've provided them.
159
     *
160
     * @var string
161
     */
162
    protected $_queryDelimiter = '?', $_queryStringDelimiter = '&', $_queryBracketsEscaped = true;
163
164
    /**
165
     * Constructed servlet full path URLs
166
     *
167
     * @var string
168
     */
169
    protected $_pingUrl, $_updateUrl, $_searchUrl, $_systemUrl, $_threadsUrl;
170
171
    /**
172
     * Keep track of whether our URLs have been constructed
173
     *
174
     * @var boolean
175
     */
176
    protected $_urlsInited = false;
177
178
    /**
179
     * HTTP Transport implementation (pluggable)
180
     *
181
     * @var Apache_Solr_HttpTransport_Interface
182
     */
183
    protected $_httpTransport = false;
184
185
    /**
     * Constructor. All parameters are optional and take on default values
     * if not specified.
     *
     * @param string $host Host name, validated by {@link setHost()}
     * @param integer $port Port number, cast to integer and validated by {@link setPort()}
     * @param string $path Path of the Solr application, normalized by {@link setPath()}
     * @param Apache_Solr_HttpTransport_Interface|boolean $httpTransport Transport to use; when FALSE
     *        a default transport is created lazily by {@link getHttpTransport()}
     *
     * @throws Apache_Solr_InvalidArgumentException If the host is empty or the port is not a valid port number
     */
    public function __construct(
        $host = 'localhost',
        $port = 8180,
        $path = '/solr/',
        $httpTransport = false
    ) {
        $this->setHost($host);
        $this->setPort($port);
        $this->setPath($path);

        // the setters above skip URL construction until this first explicit call
        $this->_initUrls();

        if ($httpTransport) {
            $this->setHttpTransport($httpTransport);
        }

        // check that our php version is >= 5.1.3 so we can correct for http_build_query behavior later
        $this->_queryBracketsEscaped = version_compare(phpversion(), '5.1.3', '>=');
    }
214
215
    /**
     * Set the port used. The value is cast to an integer before validation.
     *
     * Rebuilds the cached servlet URLs when they have already been constructed.
     *
     * @param integer $port
     *
     * @throws Apache_Solr_InvalidArgumentException If the port is not within 1-65535
     */
    public function setPort($port)
    {
        $port = (int)$port;

        // TCP ports are 16 bit: anything outside 1-65535 can never yield a usable URL
        if ($port <= 0 || $port > 65535) {
            throw new Apache_Solr_InvalidArgumentException('Port is not a valid port number');
        }

        $this->_port = $port;

        if ($this->_urlsInited) {
            $this->_initUrls();
        }
    }
237
238
    /**
     * Construct the full URLs for all servlets we reference (extract, ping, search,
     * system, threads and update) and mark the URLs as initialized.
     *
     * NOTE(review): $_extractUrl is assigned here but is not part of the property
     * declarations visible next to $_pingUrl etc. - confirm it is declared on the class.
     */
    protected function _initUrls()
    {
        // the system, threads and update URLs carry the response writer parameter
        $writerParams = array('wt' => self::SOLR_WRITER);

        //Initialize our full servlet URLs now that we have server information
        $this->_extractUrl = $this->_constructUrl(self::EXTRACT_SERVLET);
        $this->_pingUrl = $this->_constructUrl(self::PING_SERVLET);
        $this->_searchUrl = $this->_constructUrl(self::SEARCH_SERVLET);
        $this->_systemUrl = $this->_constructUrl(self::SYSTEM_SERVLET, $writerParams);
        $this->_threadsUrl = $this->_constructUrl(self::THREADS_SERVLET, $writerParams);
        $this->_updateUrl = $this->_constructUrl(self::UPDATE_SERVLET, $writerParams);

        $this->_urlsInited = true;
    }
256
257
    /**
     * Return an http URL given this server's host, port and path and a provided servlet name,
     * optionally followed by a query string built from $params.
     *
     * @param string $servlet Servlet path relative to the configured Solr path
     * @param array $params Query parameters as name => value pairs; names and values are url-encoded
     * @return string
     */
    protected function _constructUrl($servlet, $params = array())
    {
        $queryString = '';

        if (count($params)) {
            //escape all parameters appropriately for inclusion in the query string
            $escapedParams = array();

            foreach ($params as $key => $value) {
                $escapedParams[] = urlencode($key) . '=' . urlencode($value);
            }

            $queryString = $this->_queryDelimiter . implode($this->_queryStringDelimiter, $escapedParams);
        }

        return 'http://' . $this->_host . ':' . $this->_port . $this->_path . $servlet . $queryString;
    }
281
282
    /**
     * Set the path used. Leading and trailing slashes are normalized so the stored
     * path always starts and ends with exactly one '/'; an empty path becomes '/'.
     *
     * Rebuilds the cached servlet URLs when they have already been constructed.
     *
     * @param string $path
     */
    public function setPath($path)
    {
        $path = trim($path, '/');

        $this->_path = strlen($path) > 0 ? '/' . $path . '/' : '/';

        if ($this->_urlsInited) {
            $this->_initUrls();
        }
    }
301
302
    /**
     * Escape a value for special query characters such as ':', '(', ')', '*', '?', '/', etc.
     *
     * Every special token is prefixed with a backslash. The two character operators
     * '&&' and '||' receive a single backslash in front of the pair.
     *
     * NOTE: inside a phrase fewer characters need escaped, use {@link Apache_Solr_Service::escapePhrase()} instead
     *
     * @param string $value
     * @return string
     */
    public static function escape($value)
    {
        //list taken from http://lucene.apache.org/java/docs/queryparsersyntax.html#Escaping%20Special%20Characters
        //plus '/', which newer Lucene query parsers treat as the regular expression delimiter
        $pattern = '/(\+|-|&&|\|\||!|\(|\)|\{|}|\[|]|\^|"|~|\*|\?|:|\/|\\\\)/';
        $replace = '\\\\$1';

        return preg_replace($pattern, $replace, $value);
    }
318
319
    /**
     * Convenience function for creating phrase syntax from a value: the value is
     * escaped with {@link Apache_Solr_Service::escapePhrase()} and wrapped in double quotes.
     *
     * @param string $value
     * @return string
     */
    public static function phrase($value)
    {
        return '"' . self::escapePhrase($value) . '"';
    }
329
330
    /**
     * Escape a value meant to be contained in a phrase for special query characters.
     * Only double quotes and backslashes are prefixed with a backslash.
     *
     * @param string $value
     * @return string
     */
    public static function escapePhrase($value)
    {
        $pattern = '/("|\\\\)/';
        $replace = '\\\\$1';

        return preg_replace($pattern, $replace, $value);
    }
343
344
    /**
     * Returns the host that was set.
     *
     * @return string
     */
    public function getHost()
    {
        return $this->_host;
    }
353
354
    /**
     * Set the host used.
     *
     * The value is checked with empty(), so besides '' and NULL the string '0' is
     * rejected as well. Rebuilds the cached servlet URLs when they have already
     * been constructed.
     *
     * @param string $host
     *
     * @throws Apache_Solr_InvalidArgumentException If the host parameter is empty
     */
    public function setHost($host)
    {
        if (empty($host)) {
            throw new Apache_Solr_InvalidArgumentException('Host parameter is empty');
        }

        $this->_host = $host;

        if ($this->_urlsInited) {
            $this->_initUrls();
        }
    }
374
375
    /**
     * Get the port that was set.
     *
     * @return integer
     */
    public function getPort()
    {
        return $this->_port;
    }
384
385
    /**
     * Get the path that was set, in the normalized form stored by {@link setPath()}.
     *
     * @return string
     */
    public function getPath()
    {
        return $this->_path;
    }
394
395
    /**
     * Get the current state of the create documents flag.
     *
     * @return boolean
     */
    public function getCreateDocuments()
    {
        return $this->_createDocuments;
    }
404
405
    /**
     * Set the create documents flag. This determines whether {@link Apache_Solr_Response} objects will
     * parse the response and create {@link Apache_Solr_Document} instances in place.
     *
     * @param boolean $createDocuments Cast to boolean before it is stored
     */
    public function setCreateDocuments($createDocuments)
    {
        $this->_createDocuments = (bool)$createDocuments;
    }
415
416
    /**
     * Get the current state of the collapse single value arrays flag.
     *
     * @return boolean
     */
    public function getCollapseSingleValueArrays()
    {
        return $this->_collapseSingleValueArrays;
    }
425
426
    /**
     * Set the collapse single value arrays flag.
     *
     * @param boolean $collapseSingleValueArrays Cast to boolean before it is stored
     */
    public function setCollapseSingleValueArrays($collapseSingleValueArrays)
    {
        $this->_collapseSingleValueArrays = (bool)$collapseSingleValueArrays;
    }
435
436
    /**
     * Get the current default timeout setting in seconds, as reported by the
     * configured HTTP transport.
     *
     * @return float
     *
     * @deprecated Use the getDefaultTimeout method on the HTTP transport implementation
     */
    public function getDefaultTimeout()
    {
        return $this->getHttpTransport()->getDefaultTimeout();
    }
448
449
    /**
     * Get the current configured HTTP Transport. When none has been set, an
     * {@link Apache_Solr_HttpTransport_FileGetContents} instance is created and kept.
     *
     * @return Apache_Solr_HttpTransport_Interface
     */
    public function getHttpTransport()
    {
        // lazy load a default if one has not been set
        if ($this->_httpTransport === false) {
            require_once(dirname(__FILE__) . '/HttpTransport/FileGetContents.php');

            $this->_httpTransport = new Apache_Solr_HttpTransport_FileGetContents();
        }

        return $this->_httpTransport;
    }
465
466
    /**
     * Set the HTTP Transport implementation that will be used for all HTTP requests
     *
     * @param Apache_Solr_HttpTransport_Interface $httpTransport
     */
    public function setHttpTransport(
        Apache_Solr_HttpTransport_Interface $httpTransport
    ) {
        $this->_httpTransport = $httpTransport;
    }
476
477
    /**
     * Set the default timeout for all calls that aren't passed a specific timeout.
     * The value is handed to the configured HTTP transport.
     *
     * @param float $timeout Timeout value in seconds
     *
     * @deprecated Use the setDefaultTimeout method on the HTTP transport implementation
     */
    public function setDefaultTimeout($timeout)
    {
        $this->getHttpTransport()->setDefaultTimeout($timeout);
    }
488
489
    /**
     * Convenience method to set authentication credentials on the current HTTP transport implementation
     *
     * @param string $username
     * @param string $password
     */
    public function setAuthenticationCredentials($username, $password)
    {
        $this->getHttpTransport()->setAuthenticationCredentials($username, $password);
    }
500
501
    /**
     * Get the current setting for named list treatment.
     *
     * @return string Either {@link Apache_Solr_Service::NAMED_LIST_MAP} or {@link Apache_Solr_Service::NAMED_LIST_FLAT}
     */
    public function getNamedListTreatment()
    {
        return $this->_namedListTreatment;
    }
510
511
    /**
     * Set how NamedLists should be formatted in the response data. This mainly affects
     * the facet counts format.
     *
     * @param string $namedListTreatment Either {@link Apache_Solr_Service::NAMED_LIST_FLAT} or
     *        {@link Apache_Solr_Service::NAMED_LIST_MAP}; the value is cast to string before comparison
     * @throws Apache_Solr_InvalidArgumentException If invalid option is set
     */
    public function setNamedListTreatment($namedListTreatment)
    {
        $treatment = (string)$namedListTreatment;

        if ($treatment !== self::NAMED_LIST_FLAT && $treatment !== self::NAMED_LIST_MAP) {
            throw new Apache_Solr_InvalidArgumentException('Not a valid named list treatement option');
        }

        $this->_namedListTreatment = $treatment;
    }
533
534
    /**
     * Set the string used to separate the path from the query string.
     * Defaulted to '?'
     *
     * Rebuilds the cached servlet URLs when they have already been constructed,
     * because some of them contain a query string.
     *
     * @param string $queryDelimiter
     */
    public function setQueryDelimiter($queryDelimiter)
    {
        $this->_queryDelimiter = $queryDelimiter;

        // keep the pre-built URLs in sync, as the host/port/path setters do
        if ($this->_urlsInited) {
            $this->_initUrls();
        }
    }
544
545
    /**
     * Set the string used to separate the parameters in the query string
     * Defaulted to '&'
     *
     * Rebuilds the cached servlet URLs when they have already been constructed,
     * so that they are assembled with the new delimiter.
     *
     * @param string $queryStringDelimiter
     */
    public function setQueryStringDelimiter($queryStringDelimiter)
    {
        $this->_queryStringDelimiter = $queryStringDelimiter;

        // keep the pre-built URLs in sync, as the host/port/path setters do
        if ($this->_urlsInited) {
            $this->_initUrls();
        }
    }
555
556
    /**
     * Call the /admin/ping servlet, can be used to quickly tell if a connection to the
     * server is able to be made. The servlet is queried with a HEAD request.
     *
     * @param float $timeout maximum time to wait for ping in seconds, -1 for unlimited (default is 2)
     * @return float Actual time taken to ping the server, FALSE if the response status is not 200
     */
    public function ping($timeout = 2)
    {
        $start = microtime(true);

        $httpResponse = $this->getHttpTransport()->performHeadRequest($this->_pingUrl, $timeout);
        $solrResponse = new Apache_Solr_Response(
            $httpResponse,
            $this->_createDocuments,
            $this->_collapseSingleValueArrays
        );

        if ($solrResponse->getHttpStatus() == 200) {
            return microtime(true) - $start;
        }

        return false;
    }
580
581
    /**
     * Call the /admin/system servlet and retrieve system information about Solr
     *
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If a non 200 response status is returned
     */
    public function system()
    {
        return $this->_sendRawGet($this->_systemUrl);
    }
592
593
    /**
     * Central method for making a get operation against this Solr Server
     *
     * @param string $url
     * @param float|boolean $timeout Read timeout in seconds; FALSE is passed through to the transport unchanged
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If a non 200 response status is returned
     */
    protected function _sendRawGet($url, $timeout = false)
    {
        $httpResponse = $this->getHttpTransport()->performGetRequest($url, $timeout);
        $solrResponse = new Apache_Solr_Response(
            $httpResponse,
            $this->_createDocuments,
            $this->_collapseSingleValueArrays
        );

        if ($solrResponse->getHttpStatus() != 200) {
            throw new Apache_Solr_HttpTransportException($solrResponse);
        }

        return $solrResponse;
    }
616
617
    /**
     * Call the /admin/threads servlet and retrieve information about all threads in the
     * Solr servlet's thread group. Useful for diagnostics.
     *
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If a non 200 response status is returned
     */
    public function threads()
    {
        return $this->_sendRawGet($this->_threadsUrl);
    }
629
630
    /**
     * Add a Solr Document to the index
     *
     * @param Apache_Solr_Document $document
     * @param boolean $allowDups Not used when building the request; kept for interface compatibility
     * @param boolean $overwritePending Not used when building the request; kept for interface compatibility
     * @param boolean $overwriteCommitted Not used when building the request; kept for interface compatibility
     * @param integer $commitWithin The number of milliseconds that a document must be committed within, see {@link http://wiki.apache.org/solr/UpdateXmlMessages#The_Update_Schema} for details.  If left empty this property will not be set in the request.
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     */
    public function addDocument(
        Apache_Solr_Document $document,
        $allowDups = false,
        $overwritePending = true,
        $overwriteCommitted = true,
        $commitWithin = 0
    ) {
        // only positive values end up as a commitWithin attribute on the <add> element
        $commitWithin = (int)$commitWithin;
        $commitWithinString = $commitWithin > 0 ? " commitWithin=\"{$commitWithin}\"" : '';

        $rawPost = "<add{$commitWithinString}>";
        $rawPost .= $this->_documentToXmlFragment($document);
        $rawPost .= '</add>';

        return $this->add($rawPost);
    }
662
663
    /**
     * Create an XML fragment from a {@link Apache_Solr_Document} instance appropriate for use inside a Solr add call
     *
     * Each field value becomes one <field> element; array values produce one element per
     * entry. A field boost is only written on the first element of a field.
     *
     * @param Apache_Solr_Document $document
     * @return string
     */
    protected function _documentToXmlFragment(Apache_Solr_Document $document)
    {
        $xml = '<doc';

        $documentBoost = $document->getBoost();
        if ($documentBoost !== false) {
            $xml .= ' boost="' . $documentBoost . '"';
        }

        $xml .= '>';

        foreach ($document as $key => $value) {
            $fieldBoost = $document->getFieldBoost($key);
            $key = htmlspecialchars($key, ENT_QUOTES, 'UTF-8');

            // treat single values as a one element list so both cases share one code path
            $values = is_array($value) ? $value : array($value);

            foreach ($values as $singleValue) {
                $xml .= '<field name="' . $key . '"';

                if ($fieldBoost !== false) {
                    $xml .= ' boost="' . $fieldBoost . '"';

                    // only set the boost for the first field in the set
                    $fieldBoost = false;
                }

                $xml .= '>' . htmlspecialchars($singleValue, ENT_NOQUOTES, 'UTF-8') . '</field>';
            }
        }

        $xml .= '</doc>';

        // replace any control characters to avoid Solr XML parser exception
        return $this->_stripCtrlChars($xml);
    }
716
717
    /**
     * Replace control (non-printable) characters from string that are invalid to Solr's XML parser with a space.
     *
     * Tab (0x09), line feed (0x0A) and carriage return (0x0D) are left untouched.
     *
     * @param string $string
     * @return string
     */
    protected function _stripCtrlChars($string)
    {
        // See:  http://w3.org/International/questions/qa-forms-utf-8.html
        // Printable utf-8 does not include any of these chars below x7F
        return preg_replace('@[\x00-\x08\x0B\x0C\x0E-\x1F]@', ' ', $string);
    }
729
730
    /**
     * Raw Add Method. Takes a raw post body and sends it to the update service.  Post body
     * should be a complete and well formed "add" xml document.
     *
     * @param string $rawPost
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If a non 200 response status is returned
     */
    public function add($rawPost)
    {
        return $this->_sendRawPost($this->_updateUrl, $rawPost);
    }
743
744
    /**
     * Central method for making a post operation against this Solr Server
     *
     * @param string $url
     * @param string $rawPost Request body
     * @param float|boolean $timeout Read timeout in seconds; FALSE is passed through to the transport unchanged
     * @param string $contentType Content type sent along with the body
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If a non 200 response status is returned
     */
    protected function _sendRawPost(
        $url,
        $rawPost,
        $timeout = false,
        $contentType = 'text/xml; charset=UTF-8'
    ) {
        // note the transport expects the content type before the timeout
        $httpResponse = $this->getHttpTransport()->performPostRequest($url, $rawPost, $contentType, $timeout);
        $solrResponse = new Apache_Solr_Response(
            $httpResponse,
            $this->_createDocuments,
            $this->_collapseSingleValueArrays
        );

        if ($solrResponse->getHttpStatus() != 200) {
            throw new Apache_Solr_HttpTransportException($solrResponse);
        }

        return $solrResponse;
    }
774
775
    /**
     * Add an array of Solr Documents to the index all at once
     *
     * Entries that are not {@link Apache_Solr_Document} instances are silently skipped.
     *
     * @param array $documents Should be an array of Apache_Solr_Document instances
     * @param boolean $allowDups Not used when building the request; kept for interface compatibility
     * @param boolean $overwritePending Not used when building the request; kept for interface compatibility
     * @param boolean $overwriteCommitted Not used when building the request; kept for interface compatibility
     * @param integer $commitWithin The number of milliseconds that a document must be committed within, see {@link http://wiki.apache.org/solr/UpdateXmlMessages#The_Update_Schema} for details.  If left empty this property will not be set in the request.
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     */
    public function addDocuments(
        $documents,
        $allowDups = false,
        $overwritePending = true,
        $overwriteCommitted = true,
        $commitWithin = 0
    ) {
        // only positive values end up as a commitWithin attribute on the <add> element
        $commitWithin = (int)$commitWithin;
        $commitWithinString = $commitWithin > 0 ? " commitWithin=\"{$commitWithin}\"" : '';

        $rawPost = "<add{$commitWithinString}>";

        foreach ($documents as $document) {
            if ($document instanceof Apache_Solr_Document) {
                $rawPost .= $this->_documentToXmlFragment($document);
            }
        }

        $rawPost .= '</add>';

        return $this->add($rawPost);
    }
813
814
    /**
     * Send a commit command.
     *
     * @param boolean $expungeDeletes Defaults to false, merge segments with deletes away
     * @param boolean $waitFlush Not used, will be removed with EXT:solr version 4.0
     * @param boolean $waitSearcher Defaults to true, block until a new searcher is opened and registered as the main query searcher, making the changes visible
     * @param float $timeout Maximum expected duration (in seconds) of the commit operation on the server (otherwise, will throw a communication exception). Defaults to 1 hour
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     */
    public function commit(
        $expungeDeletes = false,
        $waitFlush = true,
        $waitSearcher = true,
        $timeout = 3600
    ) {
        $expungeValue = $expungeDeletes ? 'true' : 'false';
        $searcherValue = $waitSearcher ? 'true' : 'false';

        $rawPost = '<commit expungeDeletes="' . $expungeValue . '" waitSearcher="' . $searcherValue . '" />';

        return $this->_sendRawPost($this->_updateUrl, $rawPost, $timeout);
    }
838
839
    /**
     * Create a delete document based on document ID and submit it
     *
     * @param string $id Expected to be utf-8 encoded
     * @param boolean $fromPending Written as the fromPending attribute of the delete element
     * @param boolean $fromCommitted Written as the fromCommitted attribute of the delete element
     * @param float $timeout Maximum expected duration of the delete operation on the server (otherwise, will throw a communication exception)
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     */
    public function deleteById(
        $id,
        $fromPending = true,
        $fromCommitted = true,
        $timeout = 3600
    ) {
        $pendingValue = $fromPending ? 'true' : 'false';
        $committedValue = $fromCommitted ? 'true' : 'false';

        //escape special xml characters
        $id = htmlspecialchars($id, ENT_NOQUOTES, 'UTF-8');

        $rawPost = '<delete fromPending="' . $pendingValue . '" fromCommitted="' . $committedValue . '"><id>' . $id . '</id></delete>';

        return $this->delete($rawPost, $timeout);
    }
866
867
    /**
     * Raw Delete Method. Takes a raw post body and sends it to the update service. Body should be
     * a complete and well formed "delete" xml document
     *
     * @param string $rawPost Expected to be utf-8 encoded xml document
     * @param float $timeout Maximum expected duration of the delete operation on the server (otherwise, will throw a communication exception)
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If a non 200 response status is returned
     */
    public function delete($rawPost, $timeout = 3600)
    {
        return $this->_sendRawPost($this->_updateUrl, $rawPost, $timeout);
    }
881
882
    /**
     * Delete several documents at once, identified by their IDs.
     *
     * Builds one "delete" xml message with an <id> element per entry of $ids
     * and passes it on to delete() for posting.
     *
     * @param array $ids Document IDs, expected to be utf-8 encoded strings
     * @param boolean $fromPending Written as the fromPending attribute of the delete message
     * @param boolean $fromCommitted Written as the fromCommitted attribute of the delete message
     * @param float $timeout Maximum expected duration of the delete operation on the server (otherwise, will throw a communication exception)
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     */
    public function deleteByMultipleIds(
        $ids,
        $fromPending = true,
        $fromCommitted = true,
        $timeout = 3600
    ) {
        $idElements = array();

        foreach ($ids as $id) {
            // each id becomes element content, so escape xml special characters
            $idElements[] = '<id>' . htmlspecialchars($id, ENT_NOQUOTES, 'UTF-8') . '</id>';
        }

        $rawPost = sprintf(
            '<delete fromPending="%s" fromCommitted="%s">%s</delete>',
            $fromPending ? 'true' : 'false',
            $fromCommitted ? 'true' : 'false',
            implode('', $idElements)
        );

        return $this->delete($rawPost, $timeout);
    }
915
916
    /**
     * Delete every document matching a query.
     *
     * Builds a "delete" xml message holding one <query> element and passes it
     * on to delete() for posting.
     *
     * @param string $rawQuery Query text, expected to be utf-8 encoded
     * @param boolean $fromPending Written as the fromPending attribute of the delete message
     * @param boolean $fromCommitted Written as the fromCommitted attribute of the delete message
     * @param float $timeout Maximum expected duration of the delete operation on the server (otherwise, will throw a communication exception)
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     */
    public function deleteByQuery(
        $rawQuery,
        $fromPending = true,
        $fromCommitted = true,
        $timeout = 3600
    ) {
        // the query ends up as element content, so xml special characters are
        // escaped while quotes are left as they are (ENT_NOQUOTES)
        $escapedQuery = htmlspecialchars($rawQuery, ENT_NOQUOTES, 'UTF-8');

        $rawPost = sprintf(
            '<delete fromPending="%s" fromCommitted="%s"><query>%s</query></delete>',
            $fromPending ? 'true' : 'false',
            $fromCommitted ? 'true' : 'false',
            $escapedQuery
        );

        return $this->delete($rawPost, $timeout);
    }
943
944
    /**
     * Use Solr Cell to extract the contents of a file. See {@link http://wiki.apache.org/solr/ExtractingRequestHandler}
     * for information on how to use Solr Cell and what parameters are available.
     *
     * A $file starting with "http://" or "https://" is handed to extractFromUrl(); any other value is read
     * with file_get_contents() and its contents are handed to extractFromString(). When no "resource.name"
     * parameter is given, the base name of $file is used for it.
     *
     * NOTE: $document and $mimetype are passed through unchanged to the method that is delegated to.
     *
     * @param string $file Path to file to extract data from
     * @param array $params optional array of key value pairs that will be sent with the post (see Solr Cell documentation)
     * @param Apache_Solr_Document $document optional document that will be used to generate post parameters (literal.* and boost.* params)
     * @param string $mimetype optional mimetype specification (for the file being extracted)
     *
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_InvalidArgumentException if $params is neither an array nor null, or if $file could not be read.
     */
    public function extract(
        $file,
        $params = array(),
        $document = null,
        $mimetype = 'application/octet-stream'
    ) {
        // null stands for "no extra parameters"; anything else has to be an array
        if (is_null($params)) {
            $params = array();
        } elseif (!is_array($params)) {
            throw new Apache_Solr_InvalidArgumentException("\$params must be a valid array or null");
        }

        // http(s) locations are not read here, extractFromUrl takes care of them
        $hasHttpPrefix = substr($file, 0, 7) == 'http://';
        $hasHttpsPrefix = substr($file, 0, 8) == 'https://';

        if ($hasHttpPrefix || $hasHttpsPrefix) {
            return $this->extractFromUrl($file, $params, $document, $mimetype);
        }

        // read the file; warnings are suppressed because a failed read is
        // reported through the exception below
        $contents = @file_get_contents($file);

        if ($contents === false) {
            throw new Apache_Solr_InvalidArgumentException("File '{$file}' is empty or could not be read");
        }

        // default the resource.name parameter to the file's base name
        if (!isset($params['resource.name'])) {
            $params['resource.name'] = basename($file);
        }

        // extractFromString does the actual request
        return $this->extractFromString($contents, $params, $document, $mimetype);
    }
1000
1001
    /**
     * Use Solr Cell to extract the contents of a document located at a URL. See
     * {@link http://wiki.apache.org/solr/ExtractingRequestHandler} for information on how to use Solr Cell
     * and what parameters are available.
     *
     * The URL is fetched with a GET request through the configured http transport. The body of a 200
     * response is handed to extractFromString(). When no "resource.name" parameter is given, the URL
     * itself is used for it.
     *
     * NOTE: $document and $mimetype are passed through unchanged to extractFromString().
     *
     * @param string $url URL
     * @param array $params optional array of key value pairs that will be sent with the post (see Solr Cell documentation)
     * @param Apache_Solr_Document $document optional document that will be used to generate post parameters (literal.* and boost.* params)
     * @param string $mimetype optional mimetype specification (for the file being extracted)
     *
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_InvalidArgumentException if $params is neither an array nor null, or if fetching $url does not give a 200 response.
     */
    public function extractFromUrl(
        $url,
        $params = array(),
        $document = null,
        $mimetype = 'application/octet-stream'
    ) {
        // null stands for "no extra parameters"; anything else has to be an array
        if (is_null($params)) {
            $params = array();
        } elseif (!is_array($params)) {
            throw new Apache_Solr_InvalidArgumentException("\$params must be a valid array or null");
        }

        // fetch the URL with our configured Http Transport; no timeout
        // argument is passed to the request
        $httpResponse = $this->getHttpTransport()->performGetRequest($url);

        // anything but a 200 response counts as failure
        if ($httpResponse->getStatusCode() != 200) {
            throw new Apache_Solr_InvalidArgumentException("URL '{$url}' returned non 200 response code");
        }

        // default the resource.name parameter to the URL
        if (!isset($params['resource.name'])) {
            $params['resource.name'] = $url;
        }

        // extractFromString does the actual request
        return $this->extractFromString(
            $httpResponse->getBody(),
            $params,
            $document,
            $mimetype
        );
    }
1053
1054
    /**
     * Use Solr Cell to extract document contents from a string. See
     * {@link http://wiki.apache.org/solr/ExtractingRequestHandler} for information on how to use Solr Cell
     * and what parameters are available.
     *
     * The parameters are sent in the query string of the extract url, $data is sent as the post body.
     * The "wt" and "json.nl" parameters are always set by this method, overwriting values passed in $params.
     *
     * NOTE: when passing an Apache_Solr_Document instance, field names and boosts will automatically be prepended by "literal." and "boost."
     * as appropriate. Any keys from the $params array will NOT be treated this way. Any mappings from the document will overwrite key / value
     * pairs in the params array if they have the same name (e.g. you pass a "literal.id" key and value in your $params array but you also
     * pass in a document instance with an "id" field - the document's value(s) will take precedence).
     * A $document that is not an Apache_Solr_Document instance is ignored.
     *
     * @param string $data Data that will be passed to Solr Cell
     * @param array $params optional array of key value pairs that will be sent with the post (see Solr Cell documentation)
     * @param Apache_Solr_Document $document optional document that will be used to generate post parameters (literal.* and boost.* params)
     * @param string $mimetype optional mimetype specification (for the data being extracted)
     *
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_InvalidArgumentException if $params is neither an array nor null.
     *
     * @todo Should be using multipart/form-data to post parameter values, but I could not get my implementation to work. Needs revisiting.
     */
    public function extractFromString(
        $data,
        $params = array(),
        $document = null,
        $mimetype = 'application/octet-stream'
    ) {
        // null stands for "no extra parameters"; anything else has to be an array
        if (is_null($params)) {
            $params = array();
        } elseif (!is_array($params)) {
            throw new Apache_Solr_InvalidArgumentException("\$params must be a valid array or null");
        }

        // make sure we receive our response in JSON and have proper name list treatment
        $params['wt'] = self::SOLR_WRITER;
        $params['json.nl'] = $this->_namedListTreatment;

        // instanceof is false for null, so no separate null check is needed
        if ($document instanceof Apache_Solr_Document) {
            // turn every document field into literal.* (and, if boosted, boost.*) parameters
            foreach ($document as $fieldName => $value) {
                $boost = $document->getFieldBoost($fieldName);

                // a boost of exactly false means "no boost set" for this field
                if ($boost !== false) {
                    $params["boost.{$fieldName}"] = $boost;
                }

                $params["literal.{$fieldName}"] = $value;
            }
        }

        // params will be sent to SOLR in the QUERY STRING
        $extractUrl = $this->_extractUrl
            . $this->_queryDelimiter
            . $this->_generateQueryString($params);

        // the data is sent to SOLR as the POST BODY - application/octet-stream is the default mimetype;
        // false is passed in the position where other calls pass their timeout
        return $this->_sendRawPost($extractUrl, $data, false, $mimetype);
    }
1116
1117
    /**
     * Build a url encoded query string from an array of parameters.
     *
     * Values that are arrays with numeric keys are written as repeated
     * "foo=bar" pairs instead of the "foo[0]=bar" form that http_build_query()
     * produces.
     *
     * @param array $params key / value pairs; use arrays as values for parameters that occur more than once
     * @return string The encoded query string
     */
    protected function _generateQueryString($params)
    {
        // use http_build_query to encode our arguments because its faster
        // than urlencoding all the parts ourselves in a loop
        //
        // because http_build_query treats arrays differently than we want to, correct the query
        // string by changing foo[#]=bar (# being an actual number) parameter strings to just
        // multiple foo=bar strings. This regex should always work since '=' will be urlencoded
        // anywhere else the regex isn't expecting it
        //
        // NOTE: before php 5.1.3 brackets were not url encoded by http_build_query - the
        // _queryBracketsEscaped instance variable tells us which form to expect. Also, before
        // 5.1.2 the arg_separator parameter was not available, so don't use it in that case
        if ($this->_queryBracketsEscaped) {
            // the numeric prefix must be a string: passing null here is
            // deprecated as of php 8.1, '' is the documented default
            $queryString = http_build_query(
                $params,
                '',
                $this->_queryStringDelimiter
            );
            $indexPattern = '/%5B(?:[0-9]|[1-9][0-9]+)%5D=/';
        } else {
            $queryString = http_build_query($params);
            $indexPattern = '/\\[(?:[0-9]|[1-9][0-9]+)\\]=/';
        }

        // strip the numeric index so only "name=value" remains
        return preg_replace($indexPattern, '=', $queryString);
    }
1141
1142
    /**
     * Send an optimize command to the update url.
     *
     * Only $waitSearcher ends up in the message that is sent; $waitFlush is
     * accepted but ignored.
     *
     * @param boolean $waitFlush Not used, will be removed with EXT:solr version 4.0
     * @param boolean $waitSearcher Written as the waitSearcher attribute of the optimize message
     * @param float $timeout Maximum expected duration of the optimize operation on the server (otherwise, will throw a communication exception)
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     */
    public function optimize(
        $waitFlush = true,
        $waitSearcher = true,
        $timeout = 3600
    ) {
        $rawPost = '<optimize waitSearcher="' . ($waitSearcher ? 'true' : 'false') . '" />';

        return $this->_sendRawPost($this->_updateUrl, $rawPost, $timeout);
    }
1164
1165
    /**
     * Simple Search interface
     *
     * The "wt", "json.nl", "q", "start" and "rows" parameters are always set
     * by this method, overwriting values of the same name passed in $params.
     * With METHOD_GET the parameters are appended to the search url, with
     * METHOD_POST they are sent as a form encoded post body.
     *
     * @param string $query The raw query string
     * @param int $offset The starting offset for result documents
     * @param int $limit The maximum number of result documents to return
     * @param array $params key / value pairs for other query parameters (see Solr documentation), use arrays for parameter keys used more than once (e.g. facet.field)
     * @param string $method The HTTP method (Apache_Solr_Service::METHOD_GET or Apache_Solr_Service::METHOD_POST)
     * @return Apache_Solr_Response
     *
     * @throws Apache_Solr_HttpTransportException If an error occurs during the service call
     * @throws Apache_Solr_InvalidArgumentException If $params is neither an array nor null, or an invalid HTTP method is used
     */
    public function search(
        $query,
        $offset = 0,
        $limit = 10,
        $params = array(),
        $method = self::METHOD_GET
    ) {
        // null stands for "no extra parameters"; anything else has to be an array
        if (is_null($params)) {
            $params = array();
        } elseif (!is_array($params)) {
            throw new Apache_Solr_InvalidArgumentException("\$params must be a valid array or null");
        }

        // response format parameters shared with the other interfaces
        $params['wt'] = self::SOLR_WRITER;
        $params['json.nl'] = $this->_namedListTreatment;

        // the search itself
        $params['q'] = $query;
        $params['start'] = $offset;
        $params['rows'] = $limit;

        $queryString = $this->_generateQueryString($params);

        if ($method == self::METHOD_GET) {
            $searchUrl = $this->_searchUrl . $this->_queryDelimiter . $queryString;

            return $this->_sendRawGet($searchUrl);
        }

        if ($method == self::METHOD_POST) {
            // the query string becomes the post body; false is passed in the
            // position where other calls pass their timeout
            return $this->_sendRawPost(
                $this->_searchUrl,
                $queryString,
                false,
                'application/x-www-form-urlencoded; charset=UTF-8'
            );
        }

        throw new Apache_Solr_InvalidArgumentException("Unsupported method '$method', please use the Apache_Solr_Service::METHOD_* constants");
    }
1218
}
1219