Hello,
I'm facing an issue:
I first used the sample_figure_understanding.ipynb notebook and got Markdown files generated.
Then I used sample_rag_langchain.ipynb to split and index my .md file into Azure AI Search, but I am getting the following error:
HttpResponseError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_25944\1031862820.py in ?()
22
23 # Initiate Azure AI Document Intelligence to load the document
24 loader = AzureAIDocumentIntelligenceLoader(file_path=file_path, api_key = doc_intelligence_key, api_endpoint = doc_intelligence_endpoint, api_model="prebuilt-layout")
25
---> 26 docs = loader.load()
27
28 # Assuming each file contains a single document for simplicity
29 docs_string = docs[0].page_content
c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self)
28 def load(self) -> List[Document]:
29 """Load data into Document objects."""
---> 30 return list(self.lazy_load())
c:\Python311\Lib\site-packages\langchain_community\document_loaders\doc_intelligence.py in ?(self)
92 ) -> Iterator[Document]:
93 """Lazy load given path as pages."""
94 if self.file_path is not None:
95 blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
---> 96 yield from self.parser.parse(blob)
97 else:
98 yield from self.parser.parse_url(self.url_path) # type: ignore[arg-type]
c:\Python311\Lib\site-packages\langchain_core\document_loaders\base.py in ?(self, blob)
122
123 Returns:
124 List of documents
125 """
--> 126 return list(self.lazy_parse(blob))
c:\Python311\Lib\site-packages\langchain_community\document_loaders\parsers\doc_intelligence.py in ?(self, blob)
76 def lazy_parse(self, blob: Blob) -> Iterator[Document]:
77 """Lazily parse the blob."""
78
79 with blob.as_bytes_io() as file_obj:
---> 80 poller = self.client.begin_analyze_document(
81 self.api_model,
82 file_obj,
83 content_type="application/octet-stream",
c:\Python311\Lib\site-packages\azure\core\tracing\decorator.py in ?(*args, **kwargs)
74 passed_in_parent = kwargs.pop("parent_span", None)
75
76 span_impl_type = settings.tracing_implementation()
77 if span_impl_type is None:
---> 78 return func(*args, **kwargs)
79
80 # Merge span is parameter is set, but only if no explicit parent are passed
81 if merge_span and not passed_in_parent:
c:\Python311\Lib\site-packages\azure\ai\documentintelligence_operations_operations.py in ?(self, model_id, analyze_request, pages, locale, string_index_type, features, query_fields, output_content_format, **kwargs)
3623 polling: Union[bool, PollingMethod] = kwargs.pop("polling", True)
3624 lro_delay = kwargs.pop("polling_interval", self._config.polling_interval)
3625 cont_token: Optional[str] = kwargs.pop("continuation_token", None)
3626 if cont_token is None:
-> 3627 raw_result = self._analyze_document_initial( # type: ignore
3628 model_id=model_id,
3629 analyze_request=analyze_request,
3630 pages=pages,
c:\Python311\Lib\site-packages\azure\ai\documentintelligence_operations_operations.py in ?(self, model_id, analyze_request, pages, locale, string_index_type, features, query_fields, output_content_format, **kwargs)
514 if _stream:
515 response.read() # Load the body in memory and close the socket
516 map_error(status_code=response.status_code, response=response, error_map=error_map)
517 error = _deserialize(_models.ErrorResponse, response.json())
--> 518 raise HttpResponseError(response=response, model=error)
519
520 response_headers = {}
521 response_headers["Retry-After"] = self._deserialize("int", response.headers.get("Retry-After"))
HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
"code": "InvalidContent",
"message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}
When I look at the generated Markdown file, I can see that titles (#) are represented by "===".
I tried to make the change manually, but I am still facing the same issue. Can anybody help?