TypeError Traceback (most recent call last)
<ipython-input-28-33fce83a7b3f> in <module>
----> 1 trafilatura.extract(downloaded, include_formatting=False, include_links=True)
/usr/local/lib/python3.8/site-packages/trafilatura/core.py in extract(filecontent, url, record_id, no_fallback, include_comments, output_format, tei_validation, target_language, include_tables, include_images, include_formatting, include_links, deduplicate, date_extraction_params, with_metadata, max_tree_size, url_blacklist, settingsfile, config)
776 url_blacklist = set()
777 # extraction
--> 778 docmeta = bare_extraction(
779 filecontent, url=url, no_fallback=no_fallback,
780 include_comments=include_comments, output_format=output_format,
/usr/local/lib/python3.8/site-packages/trafilatura/core.py in bare_extraction(filecontent, url, no_fallback, include_comments, output_format, target_language, include_tables, include_images, include_formatting, include_links, deduplicate, date_extraction_params, with_metadata, max_tree_size, url_blacklist, config)
682
683 # extract content
--> 684 postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables, include_images, include_links, deduplicate, config)
685
686 # compare if necessary
/usr/local/lib/python3.8/site-packages/trafilatura/core.py in extract_content(tree, include_tables, include_images, include_links, deduplicate, config)
382 # list(filter(None.__ne__, processed_elems))
383 result_body.extend([e for e in
--> 384 [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
385 if e is not None])
386 # remove trailing titles
/usr/local/lib/python3.8/site-packages/trafilatura/core.py in <listcomp>(.0)
382 # list(filter(None.__ne__, processed_elems))
383 result_body.extend([e for e in
--> 384 [handle_textelem(e, potential_tags, deduplicate, config) for e in subtree.xpath('.//*')]
385 if e is not None])
386 # remove trailing titles
/usr/local/lib/python3.8/site-packages/trafilatura/core.py in handle_textelem(element, potential_tags, dedupbool, config)
287 new_element = handle_titles(element)
288 elif element.tag == 'p':
--> 289 new_element = handle_paragraphs(element, potential_tags, dedupbool, config)
290 elif element.tag == 'lb':
291 if text_chars_test(element.tail) is True:
/usr/local/lib/python3.8/site-packages/trafilatura/core.py in handle_paragraphs(element, potential_tags, dedupbool, config)
171 newsub.set('rend', child.get('rend'))
172 elif child.tag == 'ref':
--> 173 newsub.set('target', child.get('target'))
174 # handle line breaks
175 elif child.tag == 'lb':
src/lxml/etree.pyx in lxml.etree._Element.set()
src/lxml/apihelpers.pxi in lxml.etree._setAttributeValue()
src/lxml/apihelpers.pxi in lxml.etree._utf8()
TypeError: Argument must be bytes or unicode, got 'NoneType'
I believe this error could be handled gracefully (perhaps by catching the exception, or by skipping the `target` attribute when `child.get('target')` returns `None`?), but maybe I'm missing something?