Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1######################## BEGIN LICENSE BLOCK ######################## 

2# This library is free software; you can redistribute it and/or 

3# modify it under the terms of the GNU Lesser General Public 

4# License as published by the Free Software Foundation; either 

5# version 2.1 of the License, or (at your option) any later version. 

6# 

7# This library is distributed in the hope that it will be useful, 

8# but WITHOUT ANY WARRANTY; without even the implied warranty of 

9# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

10# Lesser General Public License for more details. 

11# 

12# You should have received a copy of the GNU Lesser General Public 

13# License along with this library; if not, write to the Free Software 

14# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 

15# 02110-1301 USA 

16######################### END LICENSE BLOCK ######################### 

17 

18 

19from .universaldetector import UniversalDetector 

20from .enums import InputState 

21from .version import __version__, VERSION 

22 

23 

# Public names exported when a caller does a star-import of this package.
__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']

25 

26 

def detect(byte_str):
    """
    Guess the encoding of a byte sequence.

    The argument is type-checked, copied into a ``bytearray`` when it is
    given as ``bytes``, and fed in one call to a fresh
    ``UniversalDetector``.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: The value returned by ``UniversalDetector.close()``.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor
        ``bytearray``.
    """
    if not isinstance(byte_str, (bytes, bytearray)):
        message = ('Expected object of type bytes or bytearray, got: '
                   '{}'.format(type(byte_str)))
        raise TypeError(message)
    if not isinstance(byte_str, bytearray):
        # Plain ``bytes`` input is converted before it is fed in.
        byte_str = bytearray(byte_str)

    universal = UniversalDetector()
    universal.feed(byte_str)
    return universal.close()

43 

44 

def detect_all(byte_str):
    """
    List every plausible encoding of a byte sequence.

    The input is validated and fed to a fresh ``UniversalDetector``, which
    is then closed. If the detector's input state is
    ``InputState.HIGH_BYTE``, each prober whose confidence exceeds
    ``detector.MINIMUM_THRESHOLD`` contributes one candidate. Otherwise, or
    when no prober qualifies, the detector's single result is returned on
    its own.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: A list of dicts with the keys ``'encoding'``,
        ``'confidence'`` and ``'language'``, ordered from highest to
        lowest confidence, or ``[detector.result]`` as a fallback.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor
        ``bytearray``.
    """
    if not isinstance(byte_str, (bytes, bytearray)):
        message = ('Expected object of type bytes or bytearray, got: '
                   '{}'.format(type(byte_str)))
        raise TypeError(message)
    if not isinstance(byte_str, bytearray):
        # Plain ``bytes`` input is converted before it is fed in.
        byte_str = bytearray(byte_str)

    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()

    if detector._input_state == InputState.HIGH_BYTE:
        candidates = []
        for prober in detector._charset_probers:
            if not (prober.get_confidence() > detector.MINIMUM_THRESHOLD):
                continue

            encoding = prober.charset_name
            normalized = prober.charset_name.lower()
            # Report the Windows code page name rather than the ISO-8859
            # one when Windows-specific bytes were seen in the input.
            if normalized.startswith('iso-8859') and detector._has_win_bytes:
                encoding = detector.ISO_WIN_MAP.get(normalized, encoding)

            candidate = {
                'encoding': encoding,
                'confidence': prober.get_confidence(),
                'language': prober.language,
            }
            candidates.append(candidate)

        if candidates:
            # Negated key gives descending confidence; the sort is stable,
            # so ties keep prober order.
            candidates.sort(key=lambda entry: -entry['confidence'])
            return candidates

    return [detector.result]