"""Small client for the Doc2X v2 HTTP API.

The full flow, implemented by :func:`process_pdf`, is:

1. ask the service for an upload URL and a document ``uid`` (``preupload``),
2. PUT the local PDF to that URL (``put_file``),
3. poll the parse status until it finishes (``get_status``),
4. request a conversion of the parsed document and download the converted
   file (``parse_result``).

NOTE: importing this module reads the API key from ``key_file``.
"""
import datetime
import json
import os
import time

import requests as rq

base_url = "https://v2.doc2x.noedgeai.com"

# Look for the key file on E: first and fall back to the same path on D:.
key_file = "E:/华为云盘/doc2x_key.txt"
if not os.path.exists(key_file):
    key_file = "D:/华为云盘/doc2x_key.txt"
with open(key_file, "r") as f:
    secret = f.read().strip()

# Seconds to sleep between two polls of a server-side job.
POLL_INTERVAL = 3
# Seconds a single HTTP call may wait on the server before requests gives up.
# Without a timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 60


def _auth_headers():
    """Return the bearer-token header sent with every API call."""
    return {"Authorization": f"Bearer {secret}"}


def _unwrap(res, what: str):
    """Return the ``data`` payload of an API response, or raise.

    Raises:
        Exception: if the HTTP status is not 200 or the JSON ``code`` field is
            not ``"success"``. The message starts with ``"<what> failed: "``.
    """
    if res.status_code != 200:
        raise Exception(f"{what} failed: {res.text}")
    body = res.json()
    if body["code"] != "success":
        raise Exception(f"{what} failed: {body}")
    return body["data"]


def _strip_quotes(path: str) -> str:
    """Remove one pair of matching quotes wrapped around *path*, if present.

    Only a leading and trailing quote of the same kind is removed, so a path
    that merely contains a quote character somewhere is left untouched.
    """
    if len(path) >= 2 and path[0] == path[-1] and path[0] in "\"'":
        return path[1:-1]
    return path


def _remove_if_exists(path: str) -> None:
    """Delete *path*, doing nothing when the file is not there."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _save_download(content: bytes, download_path: str, filename: str) -> str:
    """Write *content* to ``download_path/filename`` and return the path used.

    Saving is best-effort: when the target directory cannot be created or
    written, the file is stored under *filename* in the current directory.
    """
    # Clean the path up BEFORE touching the file system, so that a quoted
    # path does not create a directory with quotes in its name.
    target_dir = _strip_quotes(download_path).replace("\\", "/")
    if target_dir and not target_dir.endswith("/"):
        target_dir += "/"
    try:
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        target = target_dir + filename
        with open(target, "wb") as out:
            out.write(content)
        return target
    except OSError as e:
        print(e)
        print('将文件保存在当前目录')
        with open(filename, "wb") as out:
            out.write(content)
        return filename


def preupload():
    """Request an upload slot and return the response's ``data`` payload.

    :func:`process_pdf` reads the ``"url"`` and ``"uid"`` keys from it.

    Raises:
        Exception: if the request fails or the API does not report success.
    """
    res = rq.post(
        f"{base_url}/api/v2/parse/preupload",
        headers=_auth_headers(),
        timeout=REQUEST_TIMEOUT,
    )
    return _unwrap(res, "get preupload url")


def put_file(pdf_path: str, url: str):
    """Upload the file at *pdf_path* to the upload *url* with an HTTP PUT.

    Raises:
        Exception: if the server answers with a status other than 200.
    """
    with open(pdf_path, "rb") as f:
        # The request body is the raw file byte stream.
        res = rq.put(url, data=f, timeout=REQUEST_TIMEOUT)
    if res.status_code != 200:
        raise Exception(f"put file failed: {res.text}")


def get_status(uid: str):
    """Return the ``data`` payload of the parse-status endpoint for *uid*.

    Raises:
        Exception: if the request fails or the API does not report success.
    """
    res = rq.get(
        f"{base_url}/api/v2/parse/status",
        headers=_auth_headers(),
        params={"uid": uid},
        timeout=REQUEST_TIMEOUT,
    )
    return _unwrap(res, "get status")


def parse_result(uid: str, download_path: str, to: str):
    """Convert the parsed document *uid* to format *to* and download it.

    The file is saved as ``result<YYYYmmdd_HHMMSS>.<ext>`` inside
    *download_path* (falling back to the current directory when that fails),
    where ``<ext>`` is ``docx`` for ``to == "docx"`` and ``zip`` otherwise.
    A leftover ``result.json`` in the current directory is removed afterwards.

    Args:
        uid: Document id returned by :func:`preupload`.
        download_path: Target directory; may be wrapped in quotes.
        to: Requested export format, sent to the API as-is.

    Returns:
        The path of the file that was written.

    Raises:
        Exception: if polling for the result or the download fails.
    """
    payload = {
        "uid": uid,
        "to": to,
        "formula_mode": "normal",
        "filename": "output." + to,
    }
    response = rq.post(
        f"{base_url}/api/v2/convert/parse",
        headers={**_auth_headers(), "Content-Type": "application/json"},
        data=json.dumps(payload),
        timeout=REQUEST_TIMEOUT,
    )
    # A failure of this request is surfaced by the result polling below.
    print(response.text)

    while True:
        poll = rq.get(
            f"{base_url}/api/v2/convert/parse/result",
            headers=_auth_headers(),
            params={"uid": uid},
            timeout=REQUEST_TIMEOUT,
        )
        if poll.status_code != 200:
            raise Exception(f"get result failed: {poll.text}")
        print(poll.text)
        data = poll.json().get("data") or {}
        status = data.get("status")
        if status == "success":
            download_url = data["url"]
            break
        if status != "processing":
            raise Exception(f"get result failed: {poll.text}")
        time.sleep(POLL_INTERVAL)

    download_res = rq.get(download_url, timeout=REQUEST_TIMEOUT)
    if download_res.status_code != 200:
        # Do not write an error page to disk as if it were the document.
        raise Exception(f"download failed: HTTP {download_res.status_code}")

    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    ext = "docx" if to == "docx" else "zip"
    saved_to = _save_download(
        download_res.content, download_path, f"result{stamp}.{ext}"
    )

    # result.json only exists when process_pdf ran first; calling this
    # function directly with a known uid must not fail on the cleanup.
    _remove_if_exists("result.json")
    return saved_to


def process_pdf(pdf_path: str, download_path: str, to: str = "docx"):
    """Upload a PDF, wait until it is parsed, then export and download it.

    Args:
        pdf_path: Path of the local PDF; may be wrapped in quotes.
        download_path: Directory the converted file is saved to.
        to: Export format passed on to :func:`parse_result`.

    Returns:
        The path of the downloaded file, as returned by :func:`parse_result`.

    Raises:
        Exception: if the file is missing, is not a ``.pdf``, or any API step
            fails.
    """
    # Validate the local file first so no upload slot is requested for a
    # file that cannot be sent anyway.
    pdf_path = _strip_quotes(pdf_path)
    if not os.path.exists(pdf_path):
        raise Exception("file not exists")
    if not pdf_path.lower().endswith(".pdf"):
        raise Exception("file type not supported")

    upload_data = preupload()
    print(upload_data)
    url = upload_data["url"]
    uid = upload_data["uid"]
    print(f"uid: {uid}")

    put_file(pdf_path, url)

    while True:
        status_data = get_status(uid)
        print(status_data)
        status = status_data["status"]
        if status == "success":
            with open("result.json", "w") as f:
                json.dump(status_data["result"], f)
            break
        if status == "failed":
            detail = status_data.get("detail")
            raise Exception(f"parse failed: {detail}")
        if status == "processing":
            progress = status_data.get("progress")
            print(f"progress: {progress}")
        # Sleep for every non-terminal status, so an unexpected status value
        # cannot turn this loop into a tight request loop.
        time.sleep(POLL_INTERVAL)

    return parse_result(uid, download_path, to)


if __name__ == "__main__":
    # Re-exports an already parsed document by its uid. For the full flow
    # call process_pdf(pdf_path, download_path) instead.
    uid = "0194413e-d82e-707c-b3ba-dd87e94a1d7f"
    parse_result(uid, 'E:/', 'docx')