
Kh0                 @   s   d  Z  d d l Z d d l Z d d l Z d d l m Z d d l m Z m Z m	 Z	 d d l
 m Z d d l m Z d d l m Z d d	 l m Z Gd
 d   d e  Z d S)a  
Module containing the UniversalDetector detector class, which is the primary
class a user of ``chardet`` should use.

:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
    N   )CharSetGroupProber)
InputStateLanguageFilterProbingState)EscCharSetProber)Latin1Prober)MBCSGroupProber)SBCSGroupProberc               @   s   e  Z d  Z d Z d Z e j d  Z e j d  Z e j d  Z	 d d d d	 d
 d d d d d d d d d d d i Z
 e j d d  Z d d   Z d d   Z d d   Z d S)UniversalDetectoraq  
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    g?s   [-]s   (|~{)s   [-]z
iso-8859-1zWindows-1252z
iso-8859-2zWindows-1250z
iso-8859-5zWindows-1251z
iso-8859-6zWindows-1256z
iso-8859-7zWindows-1253z
iso-8859-8zWindows-1255z
iso-8859-9zWindows-1254ziso-8859-13zWindows-1257c             C   sq   d  |  _  g  |  _ d  |  _ d  |  _ d  |  _ d  |  _ d  |  _ | |  _ t j	 t
  |  _ d  |  _ |  j   d  S)N)_esc_charset_prober_charset_probersresultdone	_got_data_input_state
_last_charlang_filterlogging	getLogger__name__logger_has_win_bytesreset)selfr    r   U/mod/web/acousticsspace/venv/lib/python3.5/site-packages/chardet/universaldetector.py__init__Q   s    									zUniversalDetector.__init__c             C   s   d d d d d d i |  _  d |  _ d |  _ d |  _ t j |  _ d |  _ |  j ra |  j j	   x |  j
 D] } | j	   qk Wd S)z
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        encodingN
confidenceg        languageF    )r   r   r   r   r   
PURE_ASCIIr   r   r   r   r   )r   proberr   r   r   r   ^   s    					zUniversalDetector.resetc             C   sF  |  j  r d St |  s d St | t  s8 t |  } |  j sc| j t j  rq d d d d d d i |  _ n | j t j	 t j
 f  r d d d d d d i |  _ n | j d	  r d d
 d d d d i |  _ nc | j d  rd d d d d d i |  _ n6 | j t j t j f  r:d d d d d d i |  _ d |  _ |  j d d k	 rcd |  _  d S|  j t j k r|  j j |  rt j |  _ n7 |  j t j k r|  j j |  j |  rt j |  _ | d d  |  _ |  j t j k rd|  j st |  j  |  _ |  j j |  t j k rBd |  j j d |  j j   d |  j j i |  _ d |  _  n |  j t j k rB|  j st  |  j  g |  _ |  j t! j" @r|  j j# t$    |  j j# t%    xZ |  j D]O } | j |  t j k rd | j d | j   d | j i |  _ d |  _  PqW|  j& j |  rBd |  _' d S)a  
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        Nr   z	UTF-8-SIGr   g      ?r     zUTF-32s     zX-ISO-10646-UCS-4-3412s     zX-ISO-10646-UCS-4-2143zUTF-16Tr   )(r   len
isinstance	bytearrayr   
startswithcodecsBOM_UTF8r   BOM_UTF32_LEBOM_UTF32_BEBOM_LEBOM_BEr   r   r"   HIGH_BYTE_DETECTORsearch	HIGH_BYTEESC_DETECTORr   	ESC_ASCIIr   r   r   feedr   FOUND_ITcharset_nameget_confidencer    r   r	   r   NON_CJKappendr
   r   WIN_BYTE_DETECTORr   )r   byte_strr#   r   r   r   r5   o   s    									zUniversalDetector.feedc       	      C   s  |  j  r |  j Sd |  _  |  j s5 |  j j d  n|  j t j k re d d d d d d i |  _ n |  j t j k rNd	 } d
 } d	 } x> |  j	 D]3 } | s q | j
   } | | k r | } | } q W| rN| |  j k rN| j } | j j   } | j
   } | j d  r0|  j r0|  j j | |  } d | d | d | j i |  _ |  j j   t j k r|  j d d	 k r|  j j d  x |  j	 D] } | sqt | t  rx^ | j D]+ } |  j j d | j | j | j
    qWq|  j j d | j | j | j
    qW|  j S)z
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        Tzno data received!r   asciir   g      ?r    r$   Ng        ziso-8859z no probers hit minimum thresholdz%s %s confidence = %s)r   r   r   r   debugr   r   r"   r2   r   r8   MINIMUM_THRESHOLDr7   lowerr)   r   ISO_WIN_MAPgetr    getEffectiveLevelr   DEBUGr'   r   probers)	r   prober_confidencemax_prober_confidence
max_proberr#   r7   lower_charset_namer   group_proberr   r   r   close   s`    				
			zUniversalDetector.closeN)r   
__module____qualname____doc__r?   recompiler0   r3   r;   rA   r   ALLr   r   r5   rK   r   r   r   r   r   3   s"   mr   )rN   r*   r   rO   charsetgroupproberr   enumsr   r   r   	escproberr   latin1proberr   mbcsgroupproberr	   sbcsgroupproberr
   objectr   r   r   r   r   <module>$   s   