1
|
|
|
import os |
|
|
|
|
2
|
|
|
import logging |
3
|
|
|
import gzip |
4
|
|
|
|
5
|
|
|
from six import text_type |
6
|
|
|
from topik.fileio._registry import register_input |
7
|
|
|
from topik.fileio.tests import test_data_path |
|
|
|
|
8
|
|
|
|
9
|
|
|
@register_input
def read_document_folder(folder, content_field='text'):
    """Walk a folder tree and yield one document dictionary per file.

    Every file found under ``folder`` (sub-folders included) is opened in
    binary mode -- through ``gzip`` when its name ends in ``.gz`` -- and
    handed to ``_process_file``.  Within each directory the files are
    visited in sorted name order.

    Parameters
    ----------
    folder : str
        The folder containing the files you want to analyze.

    content_field : str
        Unlike most other sources, this one assumes each file holds raw
        text, NOT a dictionary of categorized data.  ``content_field`` is
        therefore the key under which the raw text of each file is stored
        in the dictionary yielded for that document.

    Yields
    ------
    dict
        The dictionary built by ``_process_file``: the text stored under
        ``content_field`` and the file path stored under ``'filename'``.

    Raises
    ------
    IOError
        If ``folder`` does not exist.  Because this is a generator, the
        check runs when iteration starts, not when the function is called.

    Examples
    --------
    >>> documents = read_document_folder(
    ...     '{}/test_data_folder_files'.format(test_data_path))
    >>> next(documents)['text'] == (
    ...     u"'Interstellar' was incredible. The visuals, the score, " +
    ...     u"the acting, were all amazing. The plot is definitely one " +
    ...     u"of the most original I've seen in a while.")
    True
    """
    if not os.path.exists(folder):
        raise IOError("Folder not found!")

    for dirpath, _dirnames, filenames in os.walk(folder):
        for filename in sorted(filenames):
            # Compressed files are read through gzip; everything else is
            # read with the builtin open().
            if filename.endswith('.gz'):
                opener = gzip.open
            else:
                opener = open
            path = os.path.join(dirpath, filename)
            try:
                with opener(path, 'rb') as handle:
                    yield _process_file(handle, path, content_field)
            except ValueError as exc:
                # Report the file and carry on with the next one.
                logging.warning("Unable to process file: {}, error: {}".format(path, exc))
47
|
|
|
|
48
|
|
|
|
49
|
|
|
def _process_file(fd, fullpath, content_field):
    """Read an open file and wrap its text in a document dictionary.

    Parameters
    ----------
    fd : file-like object
        Open handle whose ``read()`` returns the whole file content,
        either as bytes (binary mode) or as already-decoded text.

    fullpath : str
        Path of the file being read.  It is stored in the result and used
        in the warning logged when the content is not valid UTF-8.

    content_field : str
        Key under which the decoded text is stored in the result.

    Returns
    -------
    dict
        ``{content_field: <unicode text>, 'filename': fullpath}``.
    """
    content = fd.read()
    if not isinstance(content, bytes):
        # The handle was opened in text mode, so there is nothing to decode.
        return {content_field: content, 'filename': fullpath}
    try:
        # Decode explicitly: calling the text type on a bytes object yields
        # its repr ("b'...'") on Python 3 rather than the decoded text.
        u_content = content.decode('utf-8')
    except UnicodeDecodeError:
        logging.warning("Encountered invalid unicode in file {}, ignoring invalid bytes".format(fullpath))
        # Positional 'ignore' keeps this working on old Python 2 releases.
        u_content = content.decode('utf-8', 'ignore')
    return {content_field: u_content, 'filename': fullpath}
57
|
|
|
|
The coding style of this project requires that you add a docstring to this code element. Below is an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.