| @@ -27,178 +27,178 @@ | ||
| 27 | 27 | * | 
| 28 | 28 | */ | 
class tx_crawler_domain_process_manager {
    /**
     * Seconds added to time() when computing process deadlines
     * (read from the "processMaxRunTime" extension setting).
     *
     * @var integer
     */
    private $timeToLive;

    /**
     * Divisor applied to the number of unassigned pending items to decide how
     * many processes are required (read from the "countInARun" setting).
     *
     * @var integer
     */
    private $countInARun;

    /**
     * Upper bound for the number of active processes
     * (read from the "processLimit" setting).
     *
     * @var integer
     */
    private $processLimit;

    /**
     * Crawler library instance; provides the extension settings and
     * CLI_releaseProcesses().
     *
     * @var tx_crawler_lib
     */
    private $crawlerObj;

    /**
     * @var tx_crawler_domain_queue_repository
     */
    private $queueRepository;

    /**
     * @var tx_crawler_domain_process_repository
     */
    private $processRepository;

    /**
     * When true, progress information is echoed to the output
     * (read from the "processVerbose" setting).
     *
     * @var boolean
     */
    private $verbose;

    /**
     * Creates the repositories and the crawler library instance and reads the
     * process related extension settings.
     */
    public function __construct() {
        $this->processRepository = new tx_crawler_domain_process_repository();
        $this->queueRepository = new tx_crawler_domain_queue_repository();
        $this->crawlerObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_lib');

        $settings = $this->crawlerObj->extensionSettings;
        $this->timeToLive = intval($settings['processMaxRunTime']);
        $this->countInARun = intval($settings['countInARun']);
        $this->processLimit = intval($settings['processLimit']);
        // Stored as a real boolean (the property is only ever used in boolean context).
        $this->verbose = (bool) intval($settings['processVerbose']);
    }

    /**
     * Starts crawler processes and supervises them until the queue is empty
     * or the timeout has elapsed.
     *
     * The loop runs at most $timeout iterations and sleeps one second per
     * iteration. In verbose mode the queue status is reported at the start
     * and each time the pending count has dropped by more than 20 items
     * since the last report.
     *
     * @param integer $timeout maximum number of one-second iterations
     * @return void
     * @throws RuntimeException if processLimit is not greater than 1
     * @throws Exception if a crawler process could not be started
     */
    public function multiProcess($timeout) {
        if ($this->processLimit <= 1) {
            throw new RuntimeException('To run crawler in multi process mode you have to configure the processLimit > 1.' . PHP_EOL);
        }

        $pendingItemsStart = $this->queueRepository->countAllPendingItems();
        $itemReportLimit = 20;
        $reportItemCount = $pendingItemsStart - $itemReportLimit;
        // Initialised here so the check after the loop is defined even if the
        // loop body never runs ($timeout <= 0).
        $currentPendingItems = $pendingItemsStart;

        if ($this->verbose) {
            $this->reportItemStatus();
        }
        $this->startRequiredProcesses();
        $nextTimeOut = time() + $this->timeToLive;

        for ($i = 0; $i < $timeout; $i++) {
            $currentPendingItems = $this->queueRepository->countAllPendingItems();

            // Freshly started processes push the next cleanup further out.
            if ($this->startRequiredProcesses()) {
                $nextTimeOut = time() + $this->timeToLive;
            }

            if ($currentPendingItems == 0) {
                if ($this->verbose) {
                    echo 'Finished...' . chr(10);
                }
                break;
            }

            if ($currentPendingItems < $reportItemCount) {
                if ($this->verbose) {
                    $this->reportItemStatus();
                }
                $reportItemCount = $currentPendingItems - $itemReportLimit;
            }

            sleep(1);

            if ($nextTimeOut < time()) {
                // NOTE(review): the filter selects rows with ttl GREATER than the
                // deadline that has just passed; for a "timed out" cleanup one
                // would expect "ttl <". Left unchanged because the semantics of
                // the ttl column / findAll() are not visible here - please confirm.
                $timedOutProcesses = $this->processRepository->findAll('', 'DESC', NULL, 0, 'ttl >' . $nextTimeOut);
                $nextTimeOut = time() + $this->timeToLive;
                if ($this->verbose) {
                    echo 'Cleanup' . implode(',', $timedOutProcesses->getProcessIds()) . chr(10);
                }
                $this->crawlerObj->CLI_releaseProcesses($timedOutProcesses->getProcessIds(), true);
            }
        }

        if ($currentPendingItems > 0 && $this->verbose) {
            echo 'Stop with timeout' . chr(10);
        }
    }

    /**
     * Echoes the current number of pending and assigned pending queue items.
     *
     * @return void
     */
    protected function reportItemStatus() {
        echo 'Pending:' . $this->queueRepository->countAllPendingItems()
            . ' / Assigned:' . $this->queueRepository->countAllAssignedPendingItems() . chr(10);
    }

    /**
     * Starts as many additional crawler processes as are both required
     * (unassigned pending items / countInARun, rounded up) and still
     * available (processLimit minus currently active processes).
     *
     * @return boolean TRUE if at least one process was started
     * @throws Exception if a crawler process could not be started
     */
    private function startRequiredProcesses() {
        $currentProcesses = $this->processRepository->countActive();
        $availableProcessesCount = $this->processLimit - $currentProcesses;

        // An unset/invalid countInARun setting ends up as 0 after intval();
        // treat it as one item per run instead of dividing by zero.
        $itemsPerRun = max(1, $this->countInARun);
        $requiredProcessesCount = ceil($this->queueRepository->countAllUnassignedPendingItems() / $itemsPerRun);

        $startProcessCount = min($availableProcessesCount, $requiredProcessesCount);
        if ($startProcessCount <= 0) {
            return FALSE;
        }

        if ($this->verbose) {
            echo 'Start ' . $startProcessCount . ' new processes (Running:' . $currentProcesses . ')';
        }

        $started = FALSE;
        for ($i = 0; $i < $startProcessCount; $i++) {
            usleep(100);
            if ($this->startProcess()) {
                // The result must not depend on the verbosity setting.
                $started = TRUE;
                if ($this->verbose) {
                    echo '.';
                }
            }
        }

        if ($this->verbose) {
            echo chr(10);
        }
        return $started;
    }

    /**
     * Launches one crawler CLI process in the background and waits up to ten
     * seconds for the count returned by countNotTimeouted() to increase.
     *
     * @return boolean TRUE once the new process has been detected
     * @throws Exception if the command could not be executed or no new
     *                   process appeared within 10 seconds
     */
    public function startProcess() {
        $ttl = (time() + $this->timeToLive - 1);
        $current = $this->processRepository->countNotTimeouted($ttl);

        // Run detached in a subshell and discard its output.
        $completePath = '(' . escapeshellcmd($this->getCrawlerCliPath()) . ' &) > /dev/null';
        if (system($completePath) === FALSE) {
            throw new Exception('could not start process!');
        }

        for ($i = 0; $i < 10; $i++) {
            if ($this->processRepository->countNotTimeouted($ttl) > $current) {
                return true;
            }
            sleep(1);
        }
        throw new Exception('Something went wrong: process did not appear within 10 seconds.');
    }

    /**
     * Returns the command line used to start the crawler: the configured
     * "phpPath" setting followed by the path to typo3/cli_dispatch.phpsh
     * (built from TYPO3_DOCUMENT_ROOT and TYPO3_SITE_PATH) and the
     * "crawler" argument.
     *
     * @return string
     */
    public function getCrawlerCliPath() {
        $phpPath = $this->crawlerObj->extensionSettings['phpPath'] . ' ';
        $pathToTypo3 = rtrim(\TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_DOCUMENT_ROOT'), '/');
        $pathToTypo3 .= rtrim(\TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_PATH'), '/');
        $cliPart = '/typo3/cli_dispatch.phpsh crawler';
        return $phpPath . $pathToTypo3 . $cliPart;
    }

}