1 | #! /usr/bin/env python2 |
||
2 | # -*- coding: utf-8 -*- |
||
3 | # Originally written by Barry Warsaw <[email protected]> |
||
4 | # |
||
5 | # Minimally patched to make it even more xgettext compatible |
||
6 | # by Peter Funk <[email protected]> |
||
7 | |||
8 | """pygettext -- Python equivalent of xgettext(1) |
||
9 | |||
10 | Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the |
||
11 | internationalization of C programs. Most of these tools are independent of |
||
12 | the programming language and can be used from within Python programs. Martin |
||
13 | von Loewis' work[1] helps considerably in this regard. |
||
14 | |||
15 | There's one problem though; xgettext is the program that scans source code |
||
16 | looking for message strings, but it groks only C (or C++). Python introduces |
||
17 | a few wrinkles, such as dual quoting characters, triple quoted strings, and |
||
18 | raw strings. xgettext understands none of this. |
||
19 | |||
20 | Enter pygettext, which uses Python's standard tokenize module to scan Python |
||
21 | source code, generating .pot files identical to what GNU xgettext[2] generates |
||
22 | for C and C++ code. From there, the standard GNU tools can be used. |
||
23 | |||
24 | A word about marking Python strings as candidates for translation. GNU |
||
25 | xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and |
||
26 | gettext_noop. But those can be a lot of text to include all over your code. |
||
27 | C and C++ have a trick: they use the C preprocessor. Most internationalized C |
||
28 | source includes a #define for gettext() to _() so that what has to be written |
||
29 | in the source is much less. Thus these are both translatable strings: |
||
30 | |||
31 | gettext("Translatable String") |
||
32 | _("Translatable String") |
||
33 | |||
34 | Python of course has no preprocessor so this doesn't work so well. Thus, |
||
35 | pygettext searches only for _() by default, but see the -k/--keyword flag |
||
36 | below for how to augment this. |
||
37 | |||
38 | [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html |
||
39 | [2] http://www.gnu.org/software/gettext/gettext.html |
||
40 | |||
41 | NOTE: pygettext attempts to be option and feature compatible with GNU xgettext |
||
42 | wherever possible. However some options are still missing or are not fully |
||
43 | implemented. Also, xgettext's use of command line switches with option |
||
44 | arguments is broken, and in these cases, pygettext just defines additional |
||
45 | switches. |
||
46 | |||
47 | Usage: pygettext [options] inputfile ... |
||
48 | |||
49 | Options: |
||
50 | |||
51 | -a |
||
52 | --extract-all |
||
53 | Extract all strings. |
||
54 | |||
55 | -d name |
||
56 | --default-domain=name |
||
57 | Rename the default output file from messages.pot to name.pot. |
||
58 | |||
59 | -E |
||
60 | --escape |
||
61 | Replace non-ASCII characters with octal escape sequences. |
||
62 | |||
63 | -D |
||
64 | --docstrings |
||
65 | Extract module, class, method, and function docstrings. These do not |
||
66 | need to be wrapped in _() markers, and in fact cannot be for Python to |
||
67 | consider them docstrings. (See also the -X option). |
||
68 | |||
69 | -h |
||
70 | --help |
||
71 | Print this help message and exit. |
||
72 | |||
73 | -k word |
||
74 | --keyword=word |
||
75 | Keywords to look for in addition to the default set, which are: |
||
76 | %(DEFAULTKEYWORDS)s |
||
77 | |||
78 | You can have multiple -k flags on the command line. |
||
79 | |||
80 | -K |
||
81 | --no-default-keywords |
||
82 | Disable the default set of keywords (see above). Any keywords |
||
83 | explicitly added with the -k/--keyword option are still recognized. |
||
84 | |||
85 | --no-location |
||
86 | Do not write filename/lineno location comments. |
||
87 | |||
88 | -n |
||
89 | --add-location |
||
90 | Write filename/lineno location comments indicating where each |
||
91 | extracted string is found in the source. These lines appear before |
||
92 | each msgid. The style of comments is controlled by the -S/--style |
||
93 | option. This is the default. |
||
94 | |||
95 | -o filename |
||
96 | --output=filename |
||
97 | Rename the default output file from messages.pot to filename. If |
||
98 | filename is `-' then the output is sent to standard out. |
||
99 | |||
100 | -p dir |
||
101 | --output-dir=dir |
||
102 | Output files will be placed in directory dir. |
||
103 | |||
104 | -S stylename |
||
105 | --style stylename |
||
106 | Specify which style to use for location comments. Two styles are |
||
107 | supported: |
||
108 | |||
109 | Solaris # File: filename, line: line-number |
||
110 | GNU #: filename:line |
||
111 | |||
112 | The style name is case insensitive. GNU style is the default. |
||
113 | |||
114 | -v |
||
115 | --verbose |
||
116 | Print the names of the files being processed. |
||
117 | |||
118 | -V |
||
119 | --version |
||
120 | Print the version of pygettext and exit. |
||
121 | |||
122 | -w columns |
||
123 | --width=columns |
||
124 | Set width of output to columns. |
||
125 | |||
126 | -x filename |
||
127 | --exclude-file=filename |
||
128 | Specify a file that contains a list of strings that are not to be |
||
129 | extracted from the input files. Each string to be excluded must |
||
130 | appear on a line by itself in the file. |
||
131 | |||
132 | -X filename |
||
133 | --no-docstrings=filename |
||
134 | Specify a file that contains a list of files (one per line) that |
||
135 | should not have their docstrings extracted. This is only useful in |
||
136 | conjunction with the -D option above. |
||
137 | |||
138 | If `inputfile' is -, standard input is read. |
||
139 | """ |
||
140 | |||
141 | import os |
||
142 | import sys |
||
143 | import time |
||
144 | import getopt |
||
145 | import tokenize |
||
146 | import operator |
||
147 | |||
# for selftesting: use fintl's gettext as the translation hook when that
# module can be imported, otherwise fall back to an identity function.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    def _(s):
        return s

__version__ = '1.4'

# Keywords recognised as translation markers unless -K is given; the joined
# form is interpolated into the usage text as %(DEFAULTKEYWORDS)s.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''
||
161 | |||
162 | |||
163 | |||
# The normal pot-file header.  msgmerge and Emacs's po-mode work better if it's
# there.  The template is %-interpolated with 'time' and 'version' when the
# catalog is written.  The doubled backslashes are deliberate: the .pot file
# must contain a literal backslash-n inside each quoted header line.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
||
184 | |||
185 | |||
def usage(code, msg=''):
    """Write the help text to stderr and exit with status ``code``.

    The module docstring is passed through the translation hook ``_`` and
    %-interpolated with the module globals, which fills in the
    ``%(DEFAULTKEYWORDS)s`` placeholder.  If ``msg`` is truthy it is written
    on its own line after the help text.
    """
    err = sys.stderr
    err.write('%s\n' % (_(__doc__) % globals(),))
    if msg:
        err.write('%s\n' % (msg,))
    sys.exit(code)
||
191 | |||
192 | |||
193 | |||
# Lookup table used by escape(): escapes[ord(c)] is the text emitted for the
# character c.  It is empty until make_escapes() has been called.
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the module-level ``escapes`` table with 256 entries.

    pass_iso8859 -- if true, character codes 160..254 are passed through
    unchanged in addition to printable ASCII (32..126); if false, only
    32..126 pass through.  Every other code becomes a three-digit octal
    escape, except backslash, tab, CR, LF and the double quote, which get
    their C-style escapes.

    The table is replaced in place, so calling this function again yields a
    fresh 256-entry table instead of appending to the previous one, and any
    existing reference to ``escapes`` sees the new contents.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. 'msgid
        # "Höhe"' does not turn into 'msgid "H\366he"'.  Taking the code
        # modulo 128 makes 160..254 be treated like 32..126.
        mod = 128
    else:
        # Escape any character outside the 32..126 range.
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    # These characters have dedicated escapes in .po files.
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('"')] = '\\"'
    escapes[:] = table
||
215 | |||
216 | |||
def escape(s):
    """Return ``s`` with every character replaced by its ``escapes`` entry.

    The table is indexed with ``ord()`` of each character, so it must have
    been filled by make_escapes() before this is called.
    """
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
||
223 | |||
224 | |||
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    The expression is evaluated with an empty ``__builtins__`` mapping and
    empty locals, so it cannot reach any builtin or module-level name.
    """
    # unwrap quotes, safely
    restricted_globals = {'__builtins__': {}}
    no_locals = {}
    return eval(s, restricted_globals, no_locals)
||
228 | |||
229 | |||
def normalize(s):
    """Render ``s`` as a double-quoted, C-style string for a .po file.

    A string without newlines becomes one quoted, escaped line.  A string
    containing newlines becomes an empty ``""`` line followed by one quoted
    line per source line, each ending in an escaped newline except the last
    (unless the input itself ended with a newline).
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The input ended with a newline: drop the empty tail and put the
        # newline back on the final real line so that it gets escaped.
        pieces.pop()
        pieces[-1] += '\n'
    escaped = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
||
245 | |||
246 | |||
247 | |||
class TokenEater:
    """Token consumer that collects translatable strings from Python source.

    An instance is called once per token with the 5-tuple produced by the
    tokenize module.  It is a small state machine; the current state is the
    bound method stored in ``self.__state``:

      __waiting        looking for a keyword, or 'class'/'def' when
                       docstring extraction is enabled
      __suiteseen      after 'class'/'def', skipping up to the colon
      __suitedocstring after the colon, looking for a docstring
      __keywordseen    after a keyword, expecting '('
      __openseen       inside keyword(...), collecting string tokens

    ``options`` must provide: docstrings, nodocstrings, keywords, toexclude,
    writelocations, locationstyle, GNU, SOLARIS and width.
    """

    def __init__(self, options):
        self.__options = options
        # Maps message text -> {(filename, lineno): isdocstring flag}.
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # Dispatch to the current state; only the start row is needed.
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL):
                    # Leading comments and blank lines may precede the
                    # module docstring; keep looking.
                    return
                # The first real token is not a string, so the module has
                # no docstring.  Fall through so that this token is still
                # examined below instead of being swallowed.
                self.__freshmodule = 0
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # NL is skipped as well (as __waiting does for the module
            # docstring) so that a comment line or blank line before the
            # docstring does not hide it.
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we seen anything else?

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # lineno defaults to the line of the '(' that opened the keyword call.
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Record the file being scanned and re-arm module docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to ``fp`` in .pot format."""
        options = self.__options
        timestamp = time.ctime(time.time())
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        fp.write(pot_header % {'time': timestamp, 'version': __version__})
        fp.write('\n')
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        for rkey in sorted(reverse.keys()):
            rentries = reverse[rkey]
            # Messages are unique, so ordering by the message text alone
            # gives the same order as comparing the whole (msg, dict) pair.
            rentries.sort(key=operator.itemgetter(0))
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                locations = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        fp.write((_(
                            '# File: %(filename)s, line: %(lineno)d') % d)
                            + '\n')
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceeds 'options.width'
                    locline = '#:'
                    for filename, lineno in locations:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            fp.write(locline + '\n')
                            locline = "#:" + s
                    if len(locline) > 2:
                        fp.write(locline + '\n')
                if isdocstring:
                    fp.write('#, docstring\n')
                fp.write('msgid %s\n' % normalize(k))
                fp.write('msgstr ""\n\n')
||
387 | |||
388 | |||
389 | |||
def main():
    """Command-line entry point: parse options, scan inputs, write the catalog.

    Options are read from ``sys.argv[1:]``.  Each input file ('-' means
    standard input) is tokenized and fed to a TokenEater; the collected
    messages are then written to the output file ('-' means standard output).

    Notes on behaviour:
      * ``--no-docstrings`` takes a filename argument, like its short form -X.
      * make_escapes() is told to pass ISO-8859 characters through unless
        -E/--escape was given, matching the documented meaning of -E.
      * Line terminators are stripped from the lines of the -x and -X files,
        so single-line messages can match and a last line without a trailing
        newline is not truncated.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            fp = open(arg)
            try:
                for line in fp:
                    # Strip only the line terminator; slicing off the last
                    # character would damage a final line without one.
                    options.nodocstrings[line.rstrip('\n')] = 1
            finally:
                fp.close()

    # calculate escapes: the argument of make_escapes() means "pass
    # iso-8859 characters through", which is the opposite of -E/--escape.
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                # One excluded string per line, without its line terminator,
                # so that it can compare equal to an extracted message.
                options.toexclude = [line.rstrip('\n') for line in fp]
            finally:
                fp.close()
        except IOError:
            sys.stderr.write((_(
                "Can't read --exclude-file: %s") % options.excludefilename)
                + '\n')
            sys.exit(1)
    else:
        options.toexclude = []

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                # Feed every token to the eater as five positional arguments.
                for token_info in tokenize.generate_tokens(fp.readline):
                    eater(*token_info)
            except tokenize.TokenError as e:
                sys.stderr.write('%s: %s, line %d, column %d\n' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]))
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()
||
537 | |||
538 | |||
# Script entry point: run the extractor only when executed directly.
if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')
||
543 |