阿里云python通用文字OCR识别接口
2023年12月25日 · 大约 1 分钟 · 约 370 字
概述
这里使用阿里云的OCR接口来识别文件中的文字,以下是代码示例,可以复制粘贴到文件中使用。
阿里云的OCR接口支持多种识别类型,包括通用文字识别、证件识别等。
OCR识别类型:https://duguang.aliyun.com/
通用文字识别示例
"""
OCR 管理后台:https://ocr.console.aliyun.com/overview
OCR OpenAPI文档:https://api.aliyun.com/api-tools/sdk/ocr-api?language=python-tea
依赖: pip install alibabacloud_ocr_api20210707==2.0.1 alibabacloud_darabonba_stream
"""
import os
import sys
from typing import List
from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_ocr_api20210707 import models as ocr_api_20210707_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.client import Client as UtilClient
class AliyunOCR:
    """Thin wrapper around the Alibaba Cloud OCR ``RecognizeAllText`` call.

    Builds an OCR client from an AccessKey pair and exposes helpers that
    submit a file (by URL or as a byte stream) for text recognition.
    """

    def __init__(
        self,
        access_key_id: str,
        access_key_secret: str,
        endpoint: str = 'ocr-api.cn-hangzhou.aliyuncs.com',
    ):
        """Create the underlying OCR client.

        Args:
            access_key_id: Required. Your AccessKey ID.
            access_key_secret: Required. Your AccessKey Secret.
            endpoint: Service endpoint to talk to. Defaults to the Hangzhou
                endpoint; see https://api.aliyun.com/product/ocr-api for
                the available endpoints.
        """
        config = open_api_models.Config(
            access_key_id=access_key_id,
            access_key_secret=access_key_secret,
        )
        config.endpoint = endpoint
        self.client: ocr_api20210707Client = ocr_api20210707Client(config)

    # NOTE: ``type`` shadows the builtin, but it is part of the public
    # keyword interface of this method, so the name is kept.
    def get(self, type: str = 'General', file_url: str = None, file_stream=None, **kwargs):
        """Send one recognition request and return the raw SDK response.

        Args:
            type: Recognition type, e.g. ``General`` or ``Advanced``.
            file_url: URL of the file to recognize.
            file_stream: Byte stream of the file to recognize.
            **kwargs: Extra fields forwarded unchanged to
                ``RecognizeAllTextRequest``.

        Note:
            Provide exactly one of ``file_url`` and ``file_stream``.

        Returns:
            The response object returned by the SDK client, or ``None`` when
            the request raised (the error is printed, not propagated).
        """
        request = ocr_api_20210707_models.RecognizeAllTextRequest(
            type=type,
            body=file_stream,
            url=file_url,
            **kwargs
        )
        runtime = util_models.RuntimeOptions()
        try:
            return self.client.recognize_all_text_with_options(request, runtime)
        except Exception as error:
            # Best-effort by design: report the failure and return None.
            # Not every exception carries ``.message``, so fall back to the
            # exception itself instead of failing inside the handler.
            print(getattr(error, 'message', error))
            return None

    def get_content_by_general(self, *args, **kwargs):
        """Run general text recognition and return the recognized content.

        Positional and keyword arguments are forwarded to :meth:`get` after
        the recognition type, so the first positional argument is
        ``file_url`` and the second is ``file_stream``.

        Returns:
            ``result.body.data.content`` of the response, or ``None`` when
            :meth:`get` returned ``None`` because the request failed.
        """
        # Pass the type positionally: combining ``type='General'`` with
        # ``*args`` would bind the first positional argument to ``type`` as
        # well and raise TypeError.
        result = self.get('General', *args, **kwargs)
        if result is None:
            return None
        return result.body.data.content
class AutoOCR(AliyunOCR):
    """AliyunOCR variant that reads its AccessKey pair from Django settings."""

    def __init__(self):
        """Initialise the client from the Django settings credentials.

        Reads ``ALIBABA_CLOUD_ACCESS_KEY_ID`` and
        ``ALIBABA_CLOUD_ACCESS_KEY_SECRET`` from ``django.conf.settings``.
        """
        # Imported here rather than at module level, so importing this
        # module does not itself import Django.
        from django.conf import settings as django_settings

        key_id = django_settings.ALIBABA_CLOUD_ACCESS_KEY_ID
        key_secret = django_settings.ALIBABA_CLOUD_ACCESS_KEY_SECRET
        super().__init__(key_id, key_secret)
if __name__ == '__main__':
    # Load the Django environment first: AutoOCR reads its credentials
    # from the Django settings.
    import django

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "conf.settings")
    django.setup()

    # Replace this with the URL of a real PDF file to test against.
    file_url = 'https://www.example.com/123.pdf'

    ocr = AutoOCR()
    result = ocr.get_content_by_general(file_url=file_url)
    print(result)