U
    }gE                     @  s"  d Z ddlmZ dZdgZddlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZmZ ddlmZmZmZmZmZmZ ddlmZmZ dd	lmZmZm Z m!Z! dd
l"m#Z# erddl$m%Z% ddlm&Z& ddl'm(Z(m)Z)m*Z* dZ+e	ee,e,f e,e,gdf Z-G dd deeZ.G dd de Z/dS )zCUse the HTMLParser library to parse HTML files that aren't too bad.    )annotationsMITHTMLParserTreeBuilder)
HTMLParser)AnyCallablecastDictIterableListOptionalTYPE_CHECKINGTupleTypeUnion)AttributeDictCDataCommentDeclarationDoctypeProcessingInstruction)EntitySubstitutionUnicodeDammit)DetectsXMLParsedAsHTMLHTMLHTMLTreeBuilderSTRICTParserRejectedMarkup)BeautifulSoup)NavigableString)	_Encoding
_Encodings
_RawMarkupzhtml.parserNc                   @  s  e Zd ZU dZded< dZded< edddd	dd
ddZd	ed< ded< ded< dddddZddddddZd2dddddddZ	d3dddddd Z
ddd!d"d#Zddd$d%d&Zddd$d'd(Zddd!d)d*Zddd!d+d,Zddd!d-d.Zddd!d/d0Zd1S )4BeautifulSoupHTMLParserreplacestrREPLACEignoreIGNOREon_duplicate_attributer   r   z&Union[str, _DuplicateAttributeHandler])soupargsr+   kwargsc                O  s:   || _ || _|jj| _tj| f|| g | _|   d S N)r,   r+   builderattribute_dict_classr   __init__already_closed_empty_elementZ_initialize_xml_detector)selfr,   r+   r-   r.    r5   ;/tmp/pip-unpacked-wheel-kgiupv3k/bs4/builder/_htmlparser.pyr2   T   s    
	z BeautifulSoupHTMLParser.__init__r+   z	List[str]r3   r,   None)messagereturnc                 C  s   t |d S r/   r   )r4   r8   r5   r5   r6   erroro   s    zBeautifulSoupHTMLParser.errorzList[Tuple[str, Optional[str]]])nameattrsr9   c                 C  s   | j ||dd | | dS )zHandle an incoming empty-element tag.

        html.parser only calls this method when the markup looks like
        <tag/>.
        F)handle_empty_elementN)handle_starttaghandle_endtag)r4   r;   r<   r5   r5   r6   handle_startendtag   s    z*BeautifulSoupHTMLParser.handle_startendtagTbool)r;   r<   r=   r9   c                 C  s   |   }|D ]f\}}|dkr d}||krj| j}|| jkr:qr|d| jfkrR|||< qrtt|}|||| q|||< q| jjjr| 	 \}}	nd }}	| jj
|dd|||	d}
|
r|
jr|r| j|dd | j| | jdkr| | dS )zHandle an opening tag, e.g. '<tag>'

        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        N )
sourceline	sourceposF)check_already_closed)r1   r+   r)   r'   r   _DuplicateAttributeHandlerr,   r0   Zstore_line_numbersgetposr>   Zis_empty_elementr?   r3   appendZ_root_tag_nameZ_root_tag_encountered)r4   r;   r<   r=   Z	attr_dictkeyvalueZon_duperC   rD   tagr5   r5   r6   r>      s:    




     

z'BeautifulSoupHTMLParser.handle_starttag)r;   rE   r9   c                 C  s,   |r|| j kr| j | n| j| dS )zHandle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
           be the closing portion of an empty-element tag,
           e.g. '<tag></tag>'.
        N)r3   remover,   r?   )r4   r;   rE   r5   r5   r6   r?      s    	z%BeautifulSoupHTMLParser.handle_endtag)datar9   c                 C  s   | j | dS )z4Handle some textual data that shows up between tags.N)r,   handle_datar4   rM   r5   r5   r6   rN      s    z#BeautifulSoupHTMLParser.handle_data)r;   r9   c              	   C  s   | drt|dd}n$| dr8t|dd}nt|}d}|dk r| jjdfD ]4}|sbqXzt|g|}W qX tk
r   Y qXX qX|szt|}W n t	t
fk
r   Y nX |pd}| | dS )zHandle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        x   XN   zwindows-1252u   �)
startswithintlstripr,   original_encoding	bytearraydecodeUnicodeDecodeErrorchr
ValueErrorOverflowErrorrN   )r4   r;   Z	real_namerM   encodingr5   r5   r6   handle_charref   s*    

z&BeautifulSoupHTMLParser.handle_charrefc                 C  s0   t j|}|dk	r|}nd| }| | dS )zHandle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        Nz&%s)r   ZHTML_ENTITY_TO_CHARACTERgetrN   )r4   r;   	characterrM   r5   r5   r6   handle_entityref
  s
    z(BeautifulSoupHTMLParser.handle_entityrefc                 C  s&   | j   | j | | j t dS )zOHandle an HTML comment.

        :param data: The text of the comment.
        N)r,   endDatarN   r   rO   r5   r5   r6   handle_comment  s    
z&BeautifulSoupHTMLParser.handle_commentc                 C  s6   | j   |tdd }| j | | j t dS )zYHandle a DOCTYPE declaration.

        :param data: The text of the declaration.
        zDOCTYPE N)r,   rc   lenrN   r   rO   r5   r5   r6   handle_decl&  s    
z#BeautifulSoupHTMLParser.handle_declc                 C  sN   |  dr$t}|tdd }nt}| j  | j| | j| dS )z{Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        zCDATA[N)upperrT   r   re   r   r,   rc   rN   )r4   rM   clsr5   r5   r6   unknown_decl0  s    
z$BeautifulSoupHTMLParser.unknown_declc                 C  s0   | j   | j | | | | j t dS )z\Handle a processing instruction.

        :param data: The text of the instruction.
        N)r,   rc   rN   Z_document_might_be_xmlr   rO   r5   r5   r6   	handle_pi?  s    

z!BeautifulSoupHTMLParser.handle_piN)T)T)__name__
__module____qualname__r'   __annotations__r)   r2   r:   r@   r>   r?   rN   r_   rb   rd   rf   ri   rj   r5   r5   r5   r6   r$   =   s&   
 >(	
r$   c                      s   e Zd ZU dZdZded< dZded< eZded< ee	e
gZd	ed
< ded< dZded< d dddd fddZd!ddddddddZdddddZ  ZS )"r   zA Beautiful soup `bs4.builder.TreeBuilder` that uses the
    :py:class:`html.parser.HTMLParser` parser, found in the Python
    standard library.

    FrA   is_xmlT	picklabler&   NAMEzIterable[str]featuresz$Tuple[Iterable[Any], Dict[str, Any]]parser_argsTRACKS_LINE_NUMBERSNzOptional[Iterable[Any]]zOptional[Dict[str, Any]]r   )rs   parser_kwargsr.   c                   sl   t  }dD ]}||kr
||}|||< q
tt| jf | |pBg }|pJi }|| d|d< ||f| _dS )a  Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        r*   Fconvert_charrefsN)dictpopsuperr   r2   updaters   )r4   rs   ru   r.   Zextra_parser_kwargsargrJ   	__class__r5   r6   r2   [  s    


zHTMLParserTreeBuilder.__init__r#   zOptional[_Encoding]zOptional[_Encodings]zDIterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]])markupuser_specified_encodingdocument_declared_encodingexclude_encodingsr9   c                 c  s   t |tr|dddfV  dS g }|r.|| g }|r@|| t|||d|d}|jdkrftdn|j|j|j|jfV  dS )a2  Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples: (markup, encoding, declared encoding,
             has undergone character replacement)

            Each 4-tuple represents a strategy for parsing the document.
            This TreeBuilder uses Unicode, Dammit to convert the markup
            into Unicode, so the ``markup`` element of the tuple will
            always be a string.
        NFT)known_definite_encodingsuser_encodingsZis_htmlr   zPCould not convert input to Unicode, and html.parser will not accept bytestrings.)	
isinstancer&   rH   r   Zunicode_markupr   rW   Zdeclared_html_encodingZcontains_replacement_characters)r4   r~   r   r   r   r   r   Zdammitr5   r5   r6   prepare_markupy  s2    



z$HTMLParserTreeBuilder.prepare_markupr7   )r~   r9   c              
   C  s   | j \}}t|tst| jd k	s&tt| jf||}z|| |  W n* tk
rx } zt|W 5 d }~X Y nX g |_	d S r/   )
rs   r   r&   AssertionErrorr,   r$   feedcloser   r3   )r4   r~   r-   r.   parserer5   r5   r6   r     s    

zHTMLParserTreeBuilder.feed)NN)NNN)rk   rl   rm   __doc__ro   rn   rp   
HTMLPARSERrq   r   r   rr   rt   r2   r   r   __classcell__r5   r5   r|   r6   r   J  s   
  !   H)0r   
__future__r   __license____all__html.parserr   typingr   r   r   r	   r
   r   r   r   r   r   r   Zbs4.elementr   r   r   r   r   r   Z
bs4.dammitr   r   Zbs4.builderr   r   r   r   Zbs4.exceptionsr   Zbs4r   r    Zbs4._typingr!   r"   r#   r   r&   rF   r$   r   r5   r5   r5   r6   <module>   s(   4   