| Total Complexity | 42 |
| Total Lines | 302 |
| Duplicated Lines | 0 % |
| Changes | 1 | ||
| Bugs | 0 | Features | 0 |
Complex classes like DocumentXapianIndexer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use DocumentXapianIndexer, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 18 | use Symfony\Component\HttpFoundation\RequestStack; |
||
| 19 | use Symfony\Component\Process\Process; |
||
| 20 | use Throwable; |
||
| 21 | use ZipArchive; |
||
| 22 | |||
| 23 | /** |
||
| 24 | * Handles Xapian indexing for CDocument entities. |
||
| 25 | */ |
||
| 26 | final class DocumentXapianIndexer |
||
| 27 | { |
||
/**
 * Wires the collaborators used to index CDocument entities into Xapian.
 *
 * @param XapianIndexService       $xapianIndexService adds documents to, and deletes documents from, the Xapian index
 * @param EntityManagerInterface   $em                 used for SearchEngineRef lookups/persistence and raw DBAL statements
 * @param SettingsManager          $settingsManager    source of the 'search.search_enabled' setting
 * @param DocumentRawTextExtractor $rawTextExtractor   produces the text stored in the indexed "content" field
 * @param RequestStack             $requestStack       not referenced by the methods visible in this part of the file;
 *                                                     presumably read when search field values are taken from the
 *                                                     current request (TODO confirm)
 */
public function __construct(
    private readonly XapianIndexService $xapianIndexService,
    private readonly EntityManagerInterface $em,
    private readonly SettingsManager $settingsManager,
    private readonly DocumentRawTextExtractor $rawTextExtractor,
    private readonly RequestStack $requestStack,
) {
}
||
| 35 | |||
/**
 * Index a CDocument into Xapian and record the resulting Xapian id in a SearchEngineRef.
 *
 * Indexing is skipped (null is returned) when search is disabled, when the document has
 * no ResourceNode, when the document is a folder, or when the index service throws.
 * A Xapian document previously recorded for the same resource node is deleted first;
 * a failure of that deletion is only logged.
 *
 * @return int|null Xapian document id or null when indexing is skipped
 */
public function indexDocument(CDocument $document): ?int
{
    $node = $document->getResourceNode();

    $searchEnabled = (string) $this->settingsManager->getSetting('search.search_enabled', true);
    if ('true' !== $searchEnabled) {
        error_log('[Xapian] indexDocument: search is disabled, skipping indexing');

        return null;
    }

    if (!$node instanceof ResourceNode) {
        error_log('[Xapian] indexDocument: missing ResourceNode, skipping');

        return null;
    }

    if ('folder' === $document->getFiletype()) {
        error_log('[Xapian] indexDocument: skipping folder document, resource_node_id='.$node->getId());

        return null;
    }

    [$courseId, $sessionId, $courseRootNodeId] = $this->resolveCourseSessionAndRootNode($node);

    $content = $this->rawTextExtractor->extract($document);

    // Optional ids are indexed as strings; a missing id becomes an empty string.
    $stringOrEmpty = static fn ($value): string => null === $value ? '' : (string) $value;

    $fields = [
        'title' => (string) $document->getTitle(),
        'description' => (string) ($document->getComment() ?? ''),
        'content' => $content,
        'filetype' => (string) $document->getFiletype(),
        'resource_node_id' => (string) $node->getId(),
        'course_id' => $stringOrEmpty($courseId),
        'session_id' => $stringOrEmpty($sessionId),
        'course_root_node_id' => $stringOrEmpty($courseRootNodeId),
        'full_path' => $document->getFullPath(),
    ];

    // 'Tdocument' is always present; course and session terms only when the ids are known.
    $terms = ['Tdocument'];
    foreach (['C' => $courseId, 'S' => $sessionId] as $prefix => $scopeId) {
        if (null !== $scopeId) {
            $terms[] = $prefix.$scopeId;
        }
    }

    $this->applyPrefilterConfigToTerms($terms, $courseId, $sessionId, $document);

    $nodeId = (int) $node->getId();
    $nodeRef = $this->em->getReference(ResourceNode::class, $nodeId);

    /** @var SearchEngineRef|null $existingRef */
    $existingRef = $this->em
        ->getRepository(SearchEngineRef::class)
        ->findOneBy(['resourceNode' => $nodeRef]);

    // Drop the previously indexed Xapian document, if one was recorded (best effort).
    $previousDocId = $existingRef?->getSearchDid();
    if (null !== $previousDocId) {
        try {
            $this->xapianIndexService->deleteDocument($previousDocId);
        } catch (Throwable $e) {
            error_log('[Xapian] indexDocument: failed to delete previous docId='.$previousDocId.' error='.$e->getMessage());
        }
    }

    // Request values (keyed by code OR by field id) are normalized to code => value ...
    $requestValuesByCode = $this->normalizeSearchFieldValuesToCode(
        $this->extractSearchFieldValuesFromRequest()
    );

    // ... and layered over the stored values, so the request overrides what is stored.
    $storedValuesByCode = $this->fetchStoredSearchFieldValuesByCode($nodeId);
    $searchFieldValuesByCode = array_replace($storedValuesByCode, $requestValuesByCode);

    // Language ISO used for stemming (resource_file > resource_node).
    $languageIso = $this->resolveLanguageIsoForResourceNode($node);

    try {
        // The index service receives the ISO code and maps it to a Xapian language itself.
        $docId = $this->xapianIndexService->indexDocument(
            $fields,
            $terms,
            $languageIso,
            $searchFieldValuesByCode
        );
    } catch (Throwable $e) {
        error_log('[Xapian] indexDocument: Xapian indexing failed: '.$e->getMessage());

        return null;
    }

    // Update the existing reference, or create and persist a new one for this node.
    $isNewRef = !$existingRef instanceof SearchEngineRef;
    $ref = $isNewRef ? new SearchEngineRef() : $existingRef;
    if ($isNewRef) {
        $ref->setResourceNode($nodeRef);
    }
    $ref->setSearchDid($docId);
    if ($isNewRef) {
        $this->em->persist($ref);
    }

    // Persist dynamic search field values (create/update).
    $this->syncSearchEngineFieldValues($nodeId, $document, $content);

    $this->em->flush();

    return $docId;
}
||
| 151 | |||
/**
 * Removes everything this indexer stored for a resource node: its
 * search_engine_field_value rows, its Xapian document and its SearchEngineRef.
 *
 * Does nothing when search is disabled. Failures while deleting the field values or
 * the Xapian document are logged and do not stop the remaining cleanup.
 */
public function deleteForResourceNodeId(int $resourceNodeId): void
{
    $searchEnabled = (string) $this->settingsManager->getSetting('search.search_enabled', true);
    if ('true' !== $searchEnabled) {
        error_log('[Xapian] deleteForResourceNodeId: search is disabled, skipping');

        return;
    }

    // Field values are removed with a direct statement, independently of the SearchEngineRef lookup below.
    try {
        $this->em->getConnection()->executeStatement(
            'DELETE FROM search_engine_field_value WHERE resource_node_id = ?',
            [$resourceNodeId]
        );
    } catch (Throwable $e) {
        error_log('[Xapian] deleteForResourceNodeId: failed to delete field values: '.$e->getMessage());
    }

    $nodeRef = $this->em->getReference(ResourceNode::class, $resourceNodeId);

    /** @var SearchEngineRef|null $searchRef */
    $searchRef = $this->em
        ->getRepository(SearchEngineRef::class)
        ->findOneBy(['resourceNode' => $nodeRef]);

    if (!$searchRef instanceof SearchEngineRef) {
        error_log('[Xapian] deleteForResourceNodeId: no SearchEngineRef found, nothing to delete');

        return;
    }

    $xapianDocId = $searchRef->getSearchDid();
    if (null !== $xapianDocId) {
        try {
            $this->xapianIndexService->deleteDocument($xapianDocId);
        } catch (Throwable $e) {
            error_log('[Xapian] deleteForResourceNodeId: deleteDocument failed for did='.$xapianDocId.' error='.$e->getMessage());
        }
    }

    // The reference row is removed even when the Xapian deletion above failed.
    $this->em->remove($searchRef);
    $this->em->flush();
}
||
| 193 | |||
/**
 * Persist search_engine_field_value dynamically based on values sent by UI/API.
 *
 * Accepts:
 * - multipart: searchFieldValues[t]=..., searchFieldValues[d]=...
 * - multipart: searchFieldValues as JSON string {"t":"..."}
 * - legacy/alt: searchFieldValues as array keyed by field id (1,2,3)
 *
 * Request keys that match no search_engine_field row are ignored, as are values that
 * cannot be converted to a string (nested arrays, non-stringable objects). Fields
 * without a request value receive a fallback from guessFallbackValue() when that
 * yields a non-empty string. All existing rows of the resource node are then replaced.
 *
 * When the request carries no values, existing rows are left untouched; the same
 * applies when their presence cannot be determined. Database failures are logged,
 * never thrown.
 *
 * @param int       $resourceNodeId resource node whose field values are replaced
 * @param CDocument $document       passed to guessFallbackValue() for missing fields
 * @param string    $content        passed to guessFallbackValue() for missing fields
 */
private function syncSearchEngineFieldValues(int $resourceNodeId, CDocument $document, string $content): void
{
    $conn = $this->em->getConnection();

    $maps = $this->fetchSearchEngineFields($conn);
    $byCode = $maps['byCode'];
    $byId = $maps['byId'];

    if (empty($byCode)) {
        error_log('[Xapian] syncSearchEngineFieldValues: no search_engine_field rows found, skipping');

        return;
    }

    // Raw values from request (could be keyed by code OR id).
    // Anything that is not an array is treated as "no input" so the loop below never
    // iterates over a non-iterable value.
    $rawValues = $this->extractSearchFieldValuesFromRequest();
    if (!\is_array($rawValues)) {
        $rawValues = [];
    }
    $hasExplicitInput = [] !== $rawValues;

    // If we didn't receive anything, do NOT overwrite existing values on update.
    // This prevents accidental resets when the request does not carry searchFieldValues.
    // The lookup is only needed (and only run) when there is no explicit input.
    if (!$hasExplicitInput) {
        try {
            $existingCount = (int) $conn->fetchOne(
                'SELECT COUNT(*) FROM search_engine_field_value WHERE resource_node_id = ?',
                [$resourceNodeId]
            );
        } catch (Throwable $e) {
            // Without a reliable count we cannot rule out existing rows, so nothing is
            // rewritten rather than risking the reset this guard exists to prevent.
            error_log(
                '[Xapian] syncSearchEngineFieldValues: failed to count existing values for resource_node_id='
                .$resourceNodeId.' error='.$e->getMessage()
            );

            return;
        }

        if ($existingCount > 0) {
            error_log(
                '[Xapian] syncSearchEngineFieldValues: no input received, keeping existing values for resource_node_id='.$resourceNodeId
            );

            return;
        }
    }

    // Normalize into field_id => value
    $valuesByFieldId = [];

    foreach ($rawValues as $key => $val) {
        // Request input may contain nested arrays (e.g. searchFieldValues[t][]=x) or
        // objects; casting those to string would raise a warning or an Error outside
        // of any try block, so they are skipped instead.
        if (null !== $val && !\is_scalar($val) && !$val instanceof \Stringable) {
            error_log('[Xapian] syncSearchEngineFieldValues: ignoring non-scalar value for key='.$key);

            continue;
        }

        // Explicit empty strings are kept: they stop the fallback below from refilling
        // a field the caller cleared.
        // NOTE(review): the previous comment said empty values are skipped when
        // building inserts, but they are inserted as empty rows — confirm which
        // behavior is intended before changing it.
        $value = (string) $val;

        $fieldId = null;

        if (is_numeric((string) $key)) {
            $id = (int) $key;
            if (isset($byId[$id])) {
                $fieldId = $id;
            }
        } else {
            $code = strtolower(trim((string) $key));
            if (isset($byCode[$code])) {
                $fieldId = (int) $byCode[$code]['id'];
            }
        }

        if (null === $fieldId) {
            continue;
        }

        $valuesByFieldId[$fieldId] = trim($value);
    }

    // Conservative fallback: only fill missing ones for known semantics (t/d/c)
    foreach ($byCode as $code => $meta) {
        $fid = (int) $meta['id'];
        if (isset($valuesByFieldId[$fid])) {
            continue;
        }

        $fallback = $this->guessFallbackValue(
            (string) $code,
            (string) ($meta['title'] ?? ''),
            $document,
            $content
        );

        if (null !== $fallback) {
            $fallback = trim($fallback);
            if ('' !== $fallback) {
                $valuesByFieldId[$fid] = $fallback;
            }
        }
    }

    // NOTE(review): the delete and the inserts are separate statements; a failing
    // insert leaves the node with only part of its values. Wrapping them in a
    // transaction depends on how callers manage transactions — confirm before adding.
    try {
        $conn->executeStatement(
            'DELETE FROM search_engine_field_value WHERE resource_node_id = ?',
            [$resourceNodeId]
        );

        foreach ($valuesByFieldId as $fid => $value) {
            $conn->insert('search_engine_field_value', [
                'resource_node_id' => $resourceNodeId,
                'field_id' => (int) $fid,
                'value' => (string) $value,
            ]);
        }
    } catch (Throwable $e) {
        error_log('[Xapian] syncSearchEngineFieldValues: failed: '.$e->getMessage());
    }
}
||
| 305 | |||
| 306 | /** |
||
| 307 | * @return array{ |
||
| 308 | * byCode: array<string, array{id:int,title:string}>, |
||
| 309 | * byId: array<int, array{code:string,title:string}> |
||
| 310 | * } |
||
| 311 | */ |
||
| 312 | private function fetchSearchEngineFields(Connection $conn): array |
||
| 313 | { |
||
| 314 | try { |
||
| 315 | $rows = $conn->fetchAllAssociative('SELECT id, code, title FROM search_engine_field'); |
||
| 316 | } catch (Throwable $e) { |
||
| 317 | error_log('[Xapian] fetchSearchEngineFields: query failed: '.$e->getMessage()); |
||
| 318 | return ['byCode' => [], 'byId' => []]; |
||
| 319 | } |
||
| 320 | |||
| 907 |