Issues (229)

tools/i18n/pygettext.py (2 issues)

1
#! /usr/bin/env python2
2
# -*- coding: utf-8 -*-
3
# Originally written by Barry Warsaw <[email protected]>
4
#
5
# Minimally patched to make it even more xgettext compatible
6
# by Peter Funk <[email protected]>
7
8
"""pygettext -- Python equivalent of xgettext(1)
9
10
Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
11
internationalization of C programs.  Most of these tools are independent of
12
the programming language and can be used from within Python programs.  Martin
13
von Loewis' work[1] helps considerably in this regard.
14
15
There's one problem though; xgettext is the program that scans source code
16
looking for message strings, but it groks only C (or C++).  Python introduces
17
a few wrinkles, such as dual quoting characters, triple quoted strings, and
18
raw strings.  xgettext understands none of this.
19
20
Enter pygettext, which uses Python's standard tokenize module to scan Python
21
source code, generating .pot files identical to what GNU xgettext[2] generates
22
for C and C++ code.  From there, the standard GNU tools can be used.
23
24
A word about marking Python strings as candidates for translation.  GNU
25
xgettext recognizes the following keywords: gettext, dgettext, dcgettext, and
26
gettext_noop.  But those can be a lot of text to include all over your code.
27
C and C++ have a trick: they use the C preprocessor.  Most internationalized C
28
source includes a #define for gettext() to _() so that what has to be written
29
in the source is much less.  Thus these are both translatable strings:
30
31
    gettext("Translatable String")
32
    _("Translatable String")
33
34
Python of course has no preprocessor so this doesn't work so well.  Thus,
35
pygettext searches only for _() by default, but see the -k/--keyword flag
36
below for how to augment this.
37
38
 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
39
 [2] http://www.gnu.org/software/gettext/gettext.html
40
41
NOTE: pygettext attempts to be option and feature compatible with GNU xgettext
42
wherever possible.  However some options are still missing or are not fully
43
implemented.  Also, xgettext's use of command line switches with option
44
arguments is broken, and in these cases, pygettext just defines additional
45
switches.
46
47
Usage: pygettext [options] inputfile ...
48
49
Options:
50
51
    -a
52
    --extract-all
53
        Extract all strings.
54
55
    -d name
56
    --default-domain=name
57
        Rename the default output file from messages.pot to name.pot.
58
59
    -E
60
    --escape
61
        Replace non-ASCII characters with octal escape sequences.
62
63
    -D
64
    --docstrings
65
        Extract module, class, method, and function docstrings.  These do not
66
        need to be wrapped in _() markers, and in fact cannot be for Python to
67
        consider them docstrings. (See also the -X option).
68
69
    -h
70
    --help
71
        Print this help message and exit.
72
73
    -k word
74
    --keyword=word
75
        Keywords to look for in addition to the default set, which are:
76
        %(DEFAULTKEYWORDS)s
77
78
        You can have multiple -k flags on the command line.
79
80
    -K
81
    --no-default-keywords
82
        Disable the default set of keywords (see above).  Any keywords
83
        explicitly added with the -k/--keyword option are still recognized.
84
85
    --no-location
86
        Do not write filename/lineno location comments.
87
88
    -n
89
    --add-location
90
        Write filename/lineno location comments indicating where each
91
        extracted string is found in the source.  These lines appear before
92
        each msgid.  The style of comments is controlled by the -S/--style
93
        option.  This is the default.
94
95
    -o filename
96
    --output=filename
97
        Rename the default output file from messages.pot to filename.  If
98
        filename is `-' then the output is sent to standard out.
99
100
    -p dir
101
    --output-dir=dir
102
        Output files will be placed in directory dir.
103
104
    -S stylename
105
    --style=stylename
106
        Specify which style to use for location comments.  Two styles are
107
        supported:
108
109
        Solaris  # File: filename, line: line-number
110
        GNU      #: filename:line
111
112
        The style name is case insensitive.  GNU style is the default.
113
114
    -v
115
    --verbose
116
        Print the names of the files being processed.
117
118
    -V
119
    --version
120
        Print the version of pygettext and exit.
121
122
    -w columns
123
    --width=columns
124
        Set width of output to columns.
125
126
    -x filename
127
    --exclude-file=filename
128
        Specify a file that contains a list of strings that are not to be
129
        extracted from the input files.  Each string to be excluded must
130
        appear on a line by itself in the file.
131
132
    -X filename
133
    --no-docstrings=filename
134
        Specify a file that contains a list of files (one per line) that
135
        should not have their docstrings extracted.  This is only useful in
136
        conjunction with the -D option above.
137
138
If `inputfile' is -, standard input is read.
139
"""
140
141
import os
142
import sys
143
import time
144
import getopt
145
import tokenize
146
import operator
147
148
# Translation hook.  The optional `fintl` module is only used for
# self-testing; when it cannot be imported, `_` falls back to an identity
# function so the rest of the module can mark strings unconditionally.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    def _(s):
        return s

__version__ = '1.4'

# Keywords that mark translatable calls by default.  DEFAULTKEYWORDS is the
# printable form that gets interpolated into the usage text.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''


# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.  %(time)s and %(version)s are filled in when the file is written.
# The Language-Team address is the conventional gettext placeholder.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
184
185

186
def usage(code, msg=''):
    """Print the help text to stderr and exit with status `code`.

    The help text is the module docstring, passed through _() and
    %-interpolated with globals() (which supplies DEFAULTKEYWORDS).  A
    non-empty `msg` is printed on stderr after the help text.
    """
    help_text = _(__doc__) % globals()
    print >> sys.stderr, help_text
    if msg:
        print >> sys.stderr, msg
    sys.exit(code)
191
192
193

194
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the global `escapes` table consulted by escape().

    After the call, `escapes` holds exactly 256 strings; entry i is the text
    written to the .po file for the character with ordinal i.

    pass_iso8859 -- if true, characters 128..255 are treated like their
        7-bit counterparts, so the printable ones pass through unchanged;
        if false, every character outside 32..126 becomes an octal escape.

    The table is replaced in place, so calling this function more than once
    neither grows the list nor leaves stale entries from an earlier call.
    """
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. an o-umlaut
        # in a msgid is written as-is rather than as the octal escape \366.
        # Otherwise we escape any character outside the 32..126 range.
        mod = 128
    else:
        mod = 256
    table = []
    for i in range(256):
        if 32 <= (i % mod) <= 126:
            table.append(chr(i))
        else:
            table.append("\\%03o" % i)
    # Characters with a dedicated C-style escape override the defaults.
    table[ord('\\')] = '\\\\'
    table[ord('\t')] = '\\t'
    table[ord('\r')] = '\\r'
    table[ord('\n')] = '\\n'
    table[ord('\"')] = '\\"'
    # Slice-assign so existing references to the list see the new contents.
    escapes[:] = table
215
216
217
def escape(s):
    """Return `s` with every character replaced by its `escapes` entry.

    The global `escapes` table is indexed by character ordinal, so it must
    have been filled by make_escapes() before this is called.
    """
    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
223
224
225
def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    The expression is evaluated with an empty __builtins__ mapping and an
    empty local namespace, so no names are available to it.
    """
    # NOTE(review): eval() with empty builtins is not a real sandbox.  The
    # callers in this file only pass the text of tokenize STRING tokens;
    # do not feed it arbitrary external input.
    restricted_globals = {'__builtins__': {}}
    return eval(s, restricted_globals, {})
228
229
230
def normalize(s):
    """Convert a Python string value into quoted .po-file syntax.

    A string without newlines becomes one double-quoted, escaped string.  A
    string with newlines becomes an empty "" line followed by one quoted
    string per source line, each ending in an escaped newline except
    possibly the last.
    """
    pieces = s.split('\n')
    if len(pieces) == 1:
        return '"' + escape(s) + '"'
    if not pieces[-1]:
        # The text ended with a newline: drop the empty tail and put the
        # newline back on the real last line so that it gets escaped too.
        pieces.pop()
        pieces[-1] += '\n'
    escaped = [escape(piece) for piece in pieces]
    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
245
246
247

248
class TokenEater:
    """State machine that collects translatable strings from a token stream.

    An instance is used as the token callback for tokenize.tokenize(); every
    token is dispatched to the handler currently stored in self.__state.
    Strings found inside calls to one of options.keywords and, when
    options.docstrings is set, module/class/function docstrings are stored
    in self.__messages, a mapping of message text to a dictionary
    {(filename, lineno): isdocstring}.  write() emits the collected entries.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state; only the start row of the token is
        # passed on
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for a docstring owner or a keyword."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                if ttype in (tokenize.COMMENT, tokenize.NL):
                    # still before the first real token of the module
                    return
                # The first real token is not a string, so the module has no
                # docstring.  Fall through instead of returning, so that this
                # token is still checked for class/def and for keywords.
                self.__freshmodule = 0
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """After 'class'/'def': skip tokens up to the next colon."""
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        """After the suite colon: record a docstring if one comes next."""
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword: only an opening parenthesis starts a call."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): gather string tokens until ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        # TBD: should we warn if we see anything else?

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record `msg` at (current file, lineno) unless it is excluded.

        lineno defaults to the line on which the enclosing keyword call
        was opened.
        """
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; its first tokens may hold a docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the pot header and all collected messages to `fp`."""
        options = self.__options
        timestamp = time.ctime(time.time())
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        print >> fp, pot_header % {'time': timestamp, 'version': __version__}
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = v.keys()
            keys.sort()
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = reverse.keys()
        rkeys.sort()
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                isdocstring = 0
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                if reduce(operator.__add__, v.values()):
                    isdocstring = 1
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = v.keys()
                v.sort()
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print >>fp, _(
                            '# File: %(filename)s, line: %(lineno)d') % d
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print >> fp, locline
                            locline = "#:" + s
                    if len(locline) > 2:
                        print >> fp, locline
                if isdocstring:
                    print >> fp, '#, docstring'
                print >> fp, 'msgid', normalize(k)
                print >> fp, 'msgstr ""\n'
387
388
389

390
def main():
391
    global default_keywords
392
    try:
393
        opts, args = getopt.getopt(
394
            sys.argv[1:],
395
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
396
            ['extract-all', 'default-domain=', 'escape', 'help',
397
             'keyword=', 'no-default-keywords',
398
             'add-location', 'no-location', 'output=', 'output-dir=',
399
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
400
             'docstrings', 'no-docstrings',
401
             ])
402
    except getopt.error, msg:
403
        usage(1, msg)
404
405
    # for holding option values
406
    class Options:
407
        # constants
408
        GNU = 1
409
        SOLARIS = 2
410
        # defaults
411
        extractall = 0 # FIXME: currently this option has no effect at all.
412
        escape = 0
413
        keywords = []
414
        outpath = ''
415
        outfile = 'messages.pot'
416
        writelocations = 1
417
        locationstyle = GNU
418
        verbose = 0
419
        width = 78
420
        excludefilename = ''
421
        docstrings = 0
422
        nodocstrings = {}
423
424
    options = Options()
425
    locations = {'gnu' : options.GNU,
426
                 'solaris' : options.SOLARIS,
427
                 }
428
429
    # parse options
430
    for opt, arg in opts:
431
        if opt in ('-h', '--help'):
432
            usage(0)
433
        elif opt in ('-a', '--extract-all'):
434
            options.extractall = 1
435
        elif opt in ('-d', '--default-domain'):
436
            options.outfile = arg + '.pot'
437
        elif opt in ('-E', '--escape'):
438
            options.escape = 1
439
        elif opt in ('-D', '--docstrings'):
440
            options.docstrings = 1
441
        elif opt in ('-k', '--keyword'):
442
            options.keywords.append(arg)
443
        elif opt in ('-K', '--no-default-keywords'):
444
            default_keywords = []
445
        elif opt in ('-n', '--add-location'):
446
            options.writelocations = 1
447
        elif opt in ('--no-location',):
448
            options.writelocations = 0
449
        elif opt in ('-S', '--style'):
450
            options.locationstyle = locations.get(arg.lower())
451
            if options.locationstyle is None:
452
                usage(1, _('Invalid value for --style: %s') % arg)
453
        elif opt in ('-o', '--output'):
454
            options.outfile = arg
455
        elif opt in ('-p', '--output-dir'):
456
            options.outpath = arg
457
        elif opt in ('-v', '--verbose'):
458
            options.verbose = 1
459
        elif opt in ('-V', '--version'):
460
            print _('pygettext.py (xgettext for Python) %s') % __version__
461
            sys.exit(0)
462
        elif opt in ('-w', '--width'):
463
            try:
464
                options.width = int(arg)
465
            except ValueError:
466
                usage(1, _('--width argument must be an integer: %s') % arg)
467
        elif opt in ('-x', '--exclude-file'):
468
            options.excludefilename = arg
469
        elif opt in ('-X', '--no-docstrings'):
470
            fp = open(arg)
471
            try:
472
                while 1:
473
                    line = fp.readline()
474
                    if not line:
475
                        break
476
                    options.nodocstrings[line[:-1]] = 1
477
            finally:
478
                fp.close()
479
480
    # calculate escapes
481
    make_escapes(options.escape)
482
483
    # calculate all keywords
484
    options.keywords.extend(default_keywords)
485
486
    # initialize list of strings to exclude
487
    if options.excludefilename:
488
        try:
489
            fp = open(options.excludefilename)
490
            options.toexclude = fp.readlines()
491
            fp.close()
492
        except IOError:
493
            print >> sys.stderr, _(
494
                "Can't read --exclude-file: %s") % options.excludefilename
495
            sys.exit(1)
496
    else:
497
        options.toexclude = []
498
499
    # slurp through all the files
500
    eater = TokenEater(options)
501
    for filename in args:
502
        if filename == '-':
503
            if options.verbose:
504
                print _('Reading standard input')
505
            fp = sys.stdin
506
            closep = 0
507
        else:
508
            if options.verbose:
509
                print _('Working on %s') % filename
510
            fp = open(filename)
511
            closep = 1
512
        try:
513
            eater.set_filename(filename)
514
            try:
515
                tokenize.tokenize(fp.readline, eater)
516
            except tokenize.TokenError, e:
517
                print >> sys.stderr, '%s: %s, line %d, column %d' % (
518
                    e[0], filename, e[1][0], e[1][1])
519
        finally:
520
            if closep:
521
                fp.close()
522
523
    # write the output
524
    if options.outfile == '-':
525
        fp = sys.stdout
526
        closep = 0
527
    else:
528
        if options.outpath:
529
            options.outfile = os.path.join(options.outpath, options.outfile)
530
        fp = open(options.outfile, 'w')
531
        closep = 1
532
    try:
533
        eater.write(fp)
534
    finally:
535
        if closep:
536
            fp.close()
537
538

539
if __name__ == '__main__':
    # Run the extractor only when executed as a script.
    main()
    # An extra marked string, kept as test input for running this tool on
    # its own source.
    _(u'a unicode string')
543