fix_encoding.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378
  1. # Copyright (c) 2011 The Chromium Authors. All rights reserved.
  2. # Use of this source code is governed by a BSD-style license that can be
  3. # found in the LICENSE file.
  4. """Collection of functions and classes to fix various encoding problems on
  5. multiple platforms with python.
  6. """
  7. from __future__ import print_function
  8. import codecs
  9. import locale
  10. import os
  11. import sys
  12. # Prevents initializing multiple times.
  13. _SYS_ARGV_PROCESSED = False
  14. def complain(message):
  15. """If any exception occurs in this file, we'll probably try to print it
  16. on stderr, which makes for frustrating debugging if stderr is directed
  17. to our wrapper. So be paranoid about catching errors and reporting them
  18. to sys.__stderr__, so that the user has a higher chance to see them.
  19. """
  20. print(
  21. isinstance(message, str) and message or repr(message),
  22. file=sys.__stderr__)
  23. def fix_default_encoding():
  24. """Forces utf8 solidly on all platforms.
  25. By default python execution environment is lazy and defaults to ascii
  26. encoding.
  27. http://uucode.com/blog/2007/03/23/shut-up-you-dummy-7-bit-python/
  28. """
  29. if sys.getdefaultencoding() == 'utf-8':
  30. return False
  31. # Regenerate setdefaultencoding.
  32. reload(sys)
  33. # Module 'sys' has no 'setdefaultencoding' member
  34. # pylint: disable=no-member
  35. sys.setdefaultencoding('utf-8')
  36. for attr in dir(locale):
  37. if attr[0:3] != 'LC_':
  38. continue
  39. aref = getattr(locale, attr)
  40. try:
  41. locale.setlocale(aref, '')
  42. except locale.Error:
  43. continue
  44. try:
  45. lang, _ = locale.getdefaultlocale()
  46. except (TypeError, ValueError):
  47. continue
  48. if lang:
  49. try:
  50. locale.setlocale(aref, (lang, 'UTF-8'))
  51. except locale.Error:
  52. os.environ[attr] = lang + '.UTF-8'
  53. try:
  54. locale.setlocale(locale.LC_ALL, '')
  55. except locale.Error:
  56. pass
  57. return True
  58. ###############################
  59. # Windows specific
  60. def fix_win_sys_argv(encoding):
  61. """Converts sys.argv to 'encoding' encoded string.
  62. utf-8 is recommended.
  63. Works around <http://bugs.python.org/issue2128>.
  64. """
  65. global _SYS_ARGV_PROCESSED
  66. if _SYS_ARGV_PROCESSED:
  67. return False
  68. # These types are available on linux but not Mac.
  69. # pylint: disable=no-name-in-module,F0401
  70. from ctypes import byref, c_int, POINTER, windll, WINFUNCTYPE
  71. from ctypes.wintypes import LPCWSTR, LPWSTR
  72. # <http://msdn.microsoft.com/en-us/library/ms683156.aspx>
  73. GetCommandLineW = WINFUNCTYPE(LPWSTR)(('GetCommandLineW', windll.kernel32))
  74. # <http://msdn.microsoft.com/en-us/library/bb776391.aspx>
  75. CommandLineToArgvW = WINFUNCTYPE(POINTER(LPWSTR), LPCWSTR, POINTER(c_int))(
  76. ('CommandLineToArgvW', windll.shell32))
  77. argc = c_int(0)
  78. argv_unicode = CommandLineToArgvW(GetCommandLineW(), byref(argc))
  79. argv = [
  80. argv_unicode[i].encode(encoding, 'replace') for i in range(0, argc.value)
  81. ]
  82. if not hasattr(sys, 'frozen'):
  83. # If this is an executable produced by py2exe or bbfreeze, then it
  84. # will have been invoked directly. Otherwise, unicode_argv[0] is the
  85. # Python interpreter, so skip that.
  86. argv = argv[1:]
  87. # Also skip option arguments to the Python interpreter.
  88. while len(argv) > 0:
  89. arg = argv[0]
  90. if not arg.startswith(b'-') or arg == b'-':
  91. break
  92. argv = argv[1:]
  93. if arg == u'-m':
  94. # sys.argv[0] should really be the absolute path of the
  95. # module source, but never mind.
  96. break
  97. if arg == u'-c':
  98. argv[0] = u'-c'
  99. break
  100. sys.argv = argv
  101. _SYS_ARGV_PROCESSED = True
  102. return True
  103. def fix_win_codec():
  104. """Works around <http://bugs.python.org/issue6058>."""
  105. # <http://msdn.microsoft.com/en-us/library/dd317756.aspx>
  106. try:
  107. codecs.lookup('cp65001')
  108. return False
  109. except LookupError:
  110. codecs.register(
  111. lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
  112. return True
  113. class WinUnicodeOutputBase(object):
  114. """Base class to adapt sys.stdout or sys.stderr to behave correctly on
  115. Windows.
  116. Setting encoding to utf-8 is recommended.
  117. """
  118. def __init__(self, fileno, name, encoding):
  119. # Corresponding file handle.
  120. self._fileno = fileno
  121. self.encoding = encoding
  122. self.name = name
  123. self.closed = False
  124. self.softspace = False
  125. self.mode = 'w'
  126. @staticmethod
  127. def isatty():
  128. return False
  129. def close(self):
  130. # Don't really close the handle, that would only cause problems.
  131. self.closed = True
  132. def fileno(self):
  133. return self._fileno
  134. def flush(self):
  135. raise NotImplementedError()
  136. def write(self, text):
  137. raise NotImplementedError()
  138. def writelines(self, lines):
  139. try:
  140. for line in lines:
  141. self.write(line)
  142. except Exception as e:
  143. complain('%s.writelines: %r' % (self.name, e))
  144. raise
  145. class WinUnicodeConsoleOutput(WinUnicodeOutputBase):
  146. """Output adapter to a Windows Console.
  147. Understands how to use the win32 console API.
  148. """
  149. def __init__(self, console_handle, fileno, stream_name, encoding):
  150. super(WinUnicodeConsoleOutput, self).__init__(
  151. fileno, '<Unicode console %s>' % stream_name, encoding)
  152. # Handle to use for WriteConsoleW
  153. self._console_handle = console_handle
  154. # Loads the necessary function.
  155. # These types are available on linux but not Mac.
  156. # pylint: disable=no-name-in-module,F0401
  157. from ctypes import byref, GetLastError, POINTER, windll, WINFUNCTYPE
  158. from ctypes.wintypes import BOOL, DWORD, HANDLE, LPWSTR
  159. from ctypes.wintypes import LPVOID # pylint: disable=no-name-in-module
  160. self._DWORD = DWORD
  161. self._byref = byref
  162. # <http://msdn.microsoft.com/en-us/library/ms687401.aspx>
  163. self._WriteConsoleW = WINFUNCTYPE(
  164. BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)(
  165. ('WriteConsoleW', windll.kernel32))
  166. self._GetLastError = GetLastError
  167. def flush(self):
  168. # No need to flush the console since it's immediate.
  169. pass
  170. def write(self, text):
  171. try:
  172. if sys.version_info.major == 2 and not isinstance(text, unicode):
  173. # Convert to unicode.
  174. text = str(text).decode(self.encoding, 'replace')
  175. elif sys.version_info.major == 3 and isinstance(text, bytes):
  176. # Bytestrings need to be decoded to a string before being passed to
  177. # Windows.
  178. text = text.decode(self.encoding, 'replace')
  179. remaining = len(text)
  180. while remaining > 0:
  181. n = self._DWORD(0)
  182. # There is a shorter-than-documented limitation on the length of the
  183. # string passed to WriteConsoleW. See
  184. # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
  185. retval = self._WriteConsoleW(
  186. self._console_handle, text,
  187. min(remaining, 10000),
  188. self._byref(n), None)
  189. if retval == 0 or n.value == 0:
  190. raise IOError(
  191. 'WriteConsoleW returned %r, n.value = %r, last error = %r' % (
  192. retval, n.value, self._GetLastError()))
  193. remaining -= n.value
  194. if not remaining:
  195. break
  196. text = text[int(n.value):]
  197. except Exception as e:
  198. complain('%s.write: %r' % (self.name, e))
  199. raise
  200. class WinUnicodeOutput(WinUnicodeOutputBase):
  201. """Output adaptor to a file output on Windows.
  202. If the standard FileWrite function is used, it will be encoded in the current
  203. code page. WriteConsoleW() permits writting any character.
  204. """
  205. def __init__(self, stream, fileno, encoding):
  206. super(WinUnicodeOutput, self).__init__(
  207. fileno, '<Unicode redirected %s>' % stream.name, encoding)
  208. # Output stream
  209. self._stream = stream
  210. # Flush right now.
  211. self.flush()
  212. def flush(self):
  213. try:
  214. self._stream.flush()
  215. except Exception as e:
  216. complain('%s.flush: %r from %r' % (self.name, e, self._stream))
  217. raise
  218. def write(self, text):
  219. try:
  220. if sys.version_info.major == 2 and isinstance(text, unicode):
  221. # Replace characters that cannot be printed instead of failing.
  222. text = text.encode(self.encoding, 'replace')
  223. self._stream.write(text)
  224. except Exception as e:
  225. complain('%s.write: %r' % (self.name, e))
  226. raise
  227. def win_handle_is_a_console(handle):
  228. """Returns True if a Windows file handle is a handle to a console."""
  229. # These types are available on linux but not Mac.
  230. # pylint: disable=no-name-in-module,F0401
  231. from ctypes import byref, POINTER, windll, WINFUNCTYPE
  232. from ctypes.wintypes import BOOL, DWORD, HANDLE
  233. FILE_TYPE_CHAR = 0x0002
  234. FILE_TYPE_REMOTE = 0x8000
  235. INVALID_HANDLE_VALUE = DWORD(-1).value
  236. # <http://msdn.microsoft.com/en-us/library/ms683167.aspx>
  237. GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(
  238. ('GetConsoleMode', windll.kernel32))
  239. # <http://msdn.microsoft.com/en-us/library/aa364960.aspx>
  240. GetFileType = WINFUNCTYPE(DWORD, DWORD)(('GetFileType', windll.kernel32))
  241. # GetStdHandle returns INVALID_HANDLE_VALUE, NULL, or a valid handle.
  242. if handle == INVALID_HANDLE_VALUE or handle is None:
  243. return False
  244. return (
  245. (GetFileType(handle) & ~FILE_TYPE_REMOTE) == FILE_TYPE_CHAR and
  246. GetConsoleMode(handle, byref(DWORD())))
  247. def win_get_unicode_stream(stream, excepted_fileno, output_handle, encoding):
  248. """Returns a unicode-compatible stream.
  249. This function will return a direct-Console writing object only if:
  250. - the file number is the expected console file number
  251. - the handle the expected file handle
  252. - the 'real' handle is in fact a handle to a console.
  253. """
  254. old_fileno = getattr(stream, 'fileno', lambda: None)()
  255. if old_fileno == excepted_fileno:
  256. # These types are available on linux but not Mac.
  257. # pylint: disable=no-name-in-module,F0401
  258. from ctypes import windll, WINFUNCTYPE
  259. from ctypes.wintypes import DWORD, HANDLE
  260. # <http://msdn.microsoft.com/en-us/library/ms683231.aspx>
  261. GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(('GetStdHandle', windll.kernel32))
  262. real_output_handle = GetStdHandle(DWORD(output_handle))
  263. if win_handle_is_a_console(real_output_handle):
  264. # It's a console.
  265. return WinUnicodeConsoleOutput(
  266. real_output_handle, old_fileno, stream.name, encoding)
  267. # It's something else. Create an auto-encoding stream.
  268. return WinUnicodeOutput(stream, old_fileno, encoding)
  269. def fix_win_console(encoding):
  270. """Makes Unicode console output work independently of the current code page.
  271. This also fixes <http://bugs.python.org/issue1602>.
  272. Credit to Michael Kaplan
  273. <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> and
  274. TZOmegaTZIOY
  275. <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>.
  276. """
  277. if (isinstance(sys.stdout, WinUnicodeOutputBase) or
  278. isinstance(sys.stderr, WinUnicodeOutputBase)):
  279. return False
  280. try:
  281. # SetConsoleCP and SetConsoleOutputCP could be used to change the code page
  282. # but it's not really useful since the code here is using WriteConsoleW().
  283. # Also, changing the code page is 'permanent' to the console and needs to be
  284. # reverted manually.
  285. # In practice one needs to set the console font to a TTF font to be able to
  286. # see all the characters but it failed for me in practice. In any case, it
  287. # won't throw any exception when printing, which is the important part.
  288. # -11 and -12 are defined in stdio.h
  289. sys.stdout = win_get_unicode_stream(sys.stdout, 1, -11, encoding)
  290. sys.stderr = win_get_unicode_stream(sys.stderr, 2, -12, encoding)
  291. # TODO(maruel): Do sys.stdin with ReadConsoleW(). Albeit the limitation is
  292. # "It doesn't appear to be possible to read Unicode characters in UTF-8
  293. # mode" and this appears to be a limitation of cmd.exe.
  294. except Exception as e:
  295. complain('exception %r while fixing up sys.stdout and sys.stderr' % e)
  296. return True
  297. def fix_encoding():
  298. """Fixes various encoding problems on all platforms.
  299. Should be called at the very beginning of the process.
  300. """
  301. ret = True
  302. if sys.platform == 'win32':
  303. ret &= fix_win_codec()
  304. ret &= fix_default_encoding()
  305. if sys.platform == 'win32':
  306. encoding = sys.getdefaultencoding()
  307. ret &= fix_win_sys_argv(encoding)
  308. ret &= fix_win_console(encoding)
  309. return ret