| 1 |  |  | import glob | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | import os | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | import time | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import unittest | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | import elasticsearch | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | import nose.tools as nt | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from topik.fileio import TopikProject | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | from topik.fileio.tests import test_data_path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
# make logging quiet during testing, to keep Travis CI logs short.
import logging
# Set up default root logging first, then raise the threshold of the two
# chatty client loggers below so only ERROR and above get through.
logging.basicConfig()
logging.getLogger('elasticsearch').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
# Base name for a saved project.
# NOTE(review): not referenced in the part of the file reviewed here - confirm it is still used.
SAVE_FILENAME = "test_project"

# One (document id, token list) pair.  The tests below look for it in the
# project's tokenized corpus after tokenizing the sample data.
sample_tokenized_doc = (2318580746137828354,
 [u'nano', u'sized', u'tio', u'particles', u'applications', u'including',
  u'use', u'photocatalysts', u'heat', u'transfer', u'fluids', u'nanofluids',
  u'present', u'study', u'tio', u'nanoparticles', u'controllable', u'phase',
  u'particle', u'size', u'obtained', u'homogeneous', u'gas', u'phase',
  u'nucleation', u'chemical', u'vapor', u'condensation', u'cvc', u'phase',
  u'particle', u'size', u'tio', u'nanoparticles', u'processing', u'conditions',
  u'characterized', u'x', u'ray', u'diffraction', u'transmission', u'electron',
  u'microscopy', u'chamber', u'temperature', u'pressure', u'key', u'parameters',
  u'affecting', u'particle', u'phase', u'size', u'pure', u'anatase', u'phase',
  u'observed', u'synthesis', u'temperatures', u'low', u'c', u'chamber',
  u'pressure', u'varying', u'torr', u'furnace', u'temperature', u'increased',
  u'c', u'pressure', u'torr', u'mixture', u'anatase', u'rutile', u'phases',
  u'observed', u'predominant', u'phase', u'anatase', u'average', u'particle',
  u'size', u'experimental', u'conditions', u'observed', u'nm'])

# Rebinds the imported ``test_data_path`` (a directory imported from
# topik.fileio.tests) to the full path of the JSON-stream sample file
# inside it; everything below uses the rebound value.
test_data_path = os.path.join(test_data_path, "test_data_json_stream.json")
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 37 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 38 |  |  | class ProjectTest(object): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 39 |  |  |     def test_context_manager(self): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 40 |  |  |         for filename in glob.glob("context_output*"): | 
            
                                                                        
                            
            
                                    
            
            
                | 41 |  |  |             os.remove(filename) | 
            
                                                                        
                            
            
                                    
            
            
                | 42 |  |  |         with TopikProject("context_output", self.output_type, self.output_args) as project: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 43 |  |  |             project.read_input(source=test_data_path, content_field='abstract') | 
            
                                                                        
                            
            
                                    
            
            
                | 44 |  |  |             project.tokenize() | 
            
                                                                        
                            
            
                                    
            
            
                | 45 |  |  |             project.vectorize(method='bag_of_words') | 
            
                                                                        
                            
            
                                    
            
            
                | 46 |  |  |             project.run_model(model_name='lda', ntopics=2) | 
            
                                                                        
                            
            
                                    
            
            
                | 47 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 48 |  |  |         # above runs through a whole workflow (minus plotting.)  At end, it closes file. | 
            
                                                                        
                            
            
                                    
            
            
                | 49 |  |  |         # load output here. | 
            
                                                                        
                            
            
                                    
            
            
                | 50 |  |  |         with TopikProject("context_output") as project: | 
            
                                                                        
                            
            
                                    
            
            
                | 51 |  |  |             nt.assert_equal(len(list(project.get_filtered_corpus_iterator())), 100) | 
            
                                                                        
                            
            
                                    
            
            
                | 52 |  |  |             nt.assert_true(sample_tokenized_doc in list(iter(project.selected_tokenized_corpus))) | 
            
                                                                        
                            
            
                                    
            
            
                | 53 |  |  |             nt.assert_equal(project.selected_vectorized_corpus.global_term_count, 2434) | 
            
                                                                        
                            
            
                                    
            
            
                | 54 |  |  |             nt.assert_equal(len(project.selected_vectorized_corpus), 100)  # All documents processed | 
            
                                                                        
                            
            
                                    
            
            
                | 55 |  |  |             for doc in project.selected_modeled_corpus.doc_topic_matrix.values(): | 
            
                                                                        
                            
            
                                    
            
            
                | 56 |  |  |                 nt.assert_almost_equal(sum(doc), 1) | 
            
                                                                        
                            
            
                                    
            
            
                | 57 |  |  |             for topic in project.selected_modeled_corpus.topic_term_matrix.values(): | 
            
                                                                        
                            
            
                                    
            
            
                | 58 |  |  |                 nt.assert_almost_equal(sum(topic), 1) | 
            
                                                                        
                            
            
                                    
            
            
                | 59 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 60 |  |  |         for filename in glob.glob("context_output*"): | 
            
                                                                        
                            
            
                                    
            
            
                | 61 |  |  |             os.remove(filename) | 
            
                                                                        
                            
            
                                    
            
            
                | 62 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 63 |  |  |     def test_read_input(self): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 64 |  |  |         nt.assert_equal(len(list(self.project.get_filtered_corpus_iterator())), 100) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 65 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 66 |  |  |     def test_get_filtered_corpus_iterator(self): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 67 |  |  |         doc_list = list(self.project.get_filtered_corpus_iterator()) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 68 |  |  |         nt.assert_equal(type(doc_list[0]), type(('123', 'text'))) | 
            
                                                                        
                            
            
                                    
            
            
                | 69 |  |  |         nt.assert_equal(len(doc_list), 100) | 
            
                                                                        
                            
            
                                    
            
            
                | 70 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 71 |  |  |     def test_get_date_filtered_corpus_iterator(self): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 72 |  |  |         results = list(self.project.get_date_filtered_corpus_iterator( | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 73 |  |  |             field_to_get="abstract", start=1975, end=1999, filter_field='year')) | 
            
                                                                        
                            
            
                                    
            
            
                | 74 |  |  |         nt.assert_equal(len(results), 25) | 
            
                                                                        
                            
            
                                    
            
            
                | 75 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 76 |  |  |     def test_tokenize(self): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 77 |  |  |         self.project.tokenize('simple') | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 78 |  |  |         in_results = False | 
            
                                                                        
                            
            
                                    
            
            
                | 79 |  |  |         for id, doc in self.project.selected_tokenized_corpus: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                            
                                                                                            
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 80 |  |  |             if doc in sample_tokenized_doc: | 
            
                                                                        
                            
            
                                    
            
            
                | 81 |  |  |                 in_results = True | 
            
                                                                        
                            
            
                                    
            
            
                | 82 |  |  |                 break | 
            
                                                                        
                            
            
                                    
            
            
                | 83 |  |  |         nt.assert_true(in_results) | 
            
                                                                        
                            
            
                                    
            
            
                | 84 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 85 |  |  |     def test_vectorize(self): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 86 |  |  |         self.project.tokenize() | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 87 |  |  |         self.project.vectorize() | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 88 |  |  |         nt.assert_equal(self.project.selected_vectorized_corpus.global_term_count, 2434) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 89 |  |  |         nt.assert_equal(len(self.project.selected_vectorized_corpus), 100)  # All documents processed | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 90 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                def test_model(self):
                    """Tokenize, vectorize and run a 2-topic 'lda' model, then check sums.

                    Every value of ``doc_topic_matrix`` and then of
                    ``topic_term_matrix`` on the selected modeled corpus must sum
                    to approximately 1.
                    """
                    self.project.tokenize()
                    self.project.vectorize()
                    self.project.run_model(model_name='lda', ntopics=2)
                    # Same check for both matrices, in the original order.
                    for matrix_name in ("doc_topic_matrix", "topic_term_matrix"):
                        modeled = self.project.selected_modeled_corpus
                        weights_by_key = getattr(modeled, matrix_name)
                        for weights in weights_by_key.values():
                            nt.assert_almost_equal(sum(weights), 1)
            
                                                                        
                            
            
                                    
            
            
                | 99 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                def test_visualize(self):
                    """Smoke-test the pipeline through to a 'termite' visualization.

                    Tokenizes, vectorizes with the 'bag_of_words' method, runs a
                    2-topic model and requests the visualization with ``topn=5``.
                    Nothing is asserted; the test passes if no call raises.
                    """
                    project = self.project
                    project.tokenize()
                    project.vectorize(method='bag_of_words')
                    project.run_model(ntopics=2)
                    project.visualize(vis_name='termite', topn=5)
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class TestInMemoryOutput(unittest.TestCase, ProjectTest):
                    """Test case combining ``ProjectTest`` with the "InMemoryOutput" type."""

                    def setUp(self):
                        """Build a ``TopikProject`` with in-memory output and read the data.

                        Input is read from ``test_data_path`` using "abstract" as the
                        content field.
                        """
                        self.output_type = "InMemoryOutput"
                        self.output_args = dict()
                        project_kwargs = {
                            "output_type": self.output_type,
                            "output_args": self.output_args,
                        }
                        self.project = TopikProject("test_project", **project_kwargs)
                        self.project.read_input(test_data_path, content_field="abstract")
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class TestElasticSearchOutput(unittest.TestCase, ProjectTest):
                    """Test case combining ``ProjectTest`` with "ElasticSearchOutput".

                    Connects to an Elasticsearch instance at "localhost" and uses
                    the index named by ``INDEX``.
                    """

                    # Name of the Elasticsearch index used by these tests.
                    INDEX = "test_index"

                    def setUp(self):
                        """Build a ``TopikProject`` backed by Elasticsearch and read the data.

                        Input is read from ``test_data_path`` using "abstract" as the
                        content field, with ``synchronous_wait=30``.
                        """
                        self.output_type = "ElasticSearchOutput"
                        self.output_args = {
                            'source': 'localhost',
                            'index': TestElasticSearchOutput.INDEX,
                            'content_field': "abstract",
                        }
                        self.project = TopikProject(
                            "test_project",
                            output_type=self.output_type,
                            output_args=self.output_args,
                        )
                        self.project.read_input(
                            test_data_path,
                            content_field="abstract",
                            synchronous_wait=30,
                        )

                    def tearDown(self):
                        """Delete the test index and, if present, its year-alias index."""
                        es_client = elasticsearch.Elasticsearch("localhost")
                        es_client.indices.delete(TestElasticSearchOutput.INDEX)
                        alias_index = "{}_year_alias_date".format(
                            TestElasticSearchOutput.INDEX)
                        if es_client.indices.exists(alias_index):
                            es_client.indices.delete(alias_index)
                        # NOTE(review): presumably lets Elasticsearch finish the
                        # deletions before the next test starts -- confirm.
                        time.sleep(1)
            
                                                        
            
                                    
            
            
                | 134 |  |  |  | 
            
                        
The coding style of this project requires that you add a docstring to this code element. Below you will find an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.