¿Cómo hago que se pueda buscar un PDF para una aplicación de búsqueda de matraces?

He estado investigando para un proyecto personal muy importante. Me gustaría crear una aplicación de búsqueda de frascos que me permita buscar contenido en más de 100 archivos PDF. He encontrado alguna información sobre A ElasticSearch Lib que funciona bien con el matraz.

#!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import json
from flask import Flask, jsonify, request, render_template,  json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)

# create a new PDF object with FPDF
pdf = FPDF()

# use an iterator to create 10 pages
for page in range(10):
    pdf.add_page()
    pdf.set_font("Arial", size=14)
    pdf.cell(150, 12, txt="Object Rocket ROCKS!!", ln=1, align="C")

# output all of the data to a new PDF file
pdf.output("object_rocket.pdf")

'''
read_pdf = PyPDF2.PdfFileReader("object_rocket.pdf")
page = read_pdf.getPage(0)
page_mode = read_pdf.getPageMode()
page_text = page.extractText()
print (type(page_text))
'''
#with open(path, 'rb') as file:

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)

# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()

# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)

# create a dictionary object for page data
all_pages = {}

# put meta data into a dict key
all_pages["meta"] = {}

# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print (meta, value)
    all_pages["meta"][meta] = value

# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()

    # extract the page's text
    page_text = data.extractText()

    # put the text data into the dict
    all_pages[page] = page_text

# create a JSON string from the dictionary
json_data = json.dumps(all_pages)
#print ("\nJSON:", json_data)

# convert JSON string to bytes-like obj
bytes_string = bytes(json_data, 'utf-8')
#print ("\nbytes_string:", bytes_string)

# convert bytes to base64 encoded string
encoded_pdf = base64.b64encode(bytes_string)
encoded_pdf = str(encoded_pdf)
#print ("\nbase64:", encoded_pdf)

# put the PDF data into a dictionary body to pass to the API request
body_doc = {"data": encoded_pdf}

# call the index() method to index the data
result = elastic_client.index(index="pdf", doc_type="_doc", id="42", body=body_doc)

# print the returned sresults
#print ("\nindex result:", result['result'])

# make another Elasticsearch API request to get the indexed PDF
result = elastic_client.get(index="pdf", doc_type='_doc', id=42)

# print the data to terminal
result_data = result["_source"]["data"]
#print ("\nresult_data:", result_data, '-- type:', type(result_data))

# decode the base64 data (use to [:] to slice off
# the 'b and ' in the string)
decoded_pdf = base64.b64decode(result_data[2:-1]).decode("utf-8")
#print ("\ndecoded_pdf:", decoded_pdf)

# take decoded string and make into JSON object
json_dict = json.loads(decoded_pdf)
#print ("\njson_str:", json_dict, "\n\ntype:", type(json_dict))
result2 = elastic_client.index(index="pdftext", doc_type="_doc", id="42", body=json_dict)

# create new FPDF object
pdf = FPDF()

# build the new PDF from the Elasticsearch dictionary
# Use 'iteritems()` instead of 'items()' for Python 2
""" for page, value in json_data:
    if page != "meta":
        # create new page
        pdf.add_page()
        pdf.set_font("Arial", size=14)

        # add content to page
        output = value + " -- Page: " + str(int(page)+1)
        pdf.cell(150, 12, txt=output, ln=1, align="C")
    else:
        # create the meta data for the new PDF
        for meta, meta_val in json_dict["meta"].items():
            if "title" in meta.lower():
                pdf.set_title(meta_val)
            elif "producer" in meta.lower() or "creator" in meta.lower():
                pdf.set_creator(meta_val)
 """
# output the PDF object's data to a PDF file
#pdf.output("object_rocket_from_elaticsearch.pdf" )

@app.route('/', methods=['GET'])
def index():

    return jsonify(json_dict)

@app.route('/<id>', methods=['GET'])
def index_by_id(id):

    return jsonify(json_dict[id])


""" @app.route('/insert_data', methods=['PUT'])
def insert_data():
    slug = request.form['slug']
    title = request.form['title']
    content = request.form['content']

    body = {
        'slug': slug,
        'title': title,
        'content': content,
        'timestamp': datetime.now()
    }

    result = es.index(index='contents', doc_type='title', id=slug, body=body)

    return jsonify(result) """



app.run(port=5003, debug=True)

------ Progreso ------ Ahora tengo una solución de trabajo sin capacidad de búsqueda frontal:

# Load_single_PDF_BY_PAGE_TO_index.py
  #!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64

from flask import Flask, jsonify, request, render_template,  json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)


#with open(path, 'rb') as file:

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)

# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()

# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)

# create a dictionary object for page data
all_pages = {}

# put meta data into a dict key
all_pages["meta"] = {}

# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print (meta, value)
    all_pages["meta"][meta] = value

x = 44
# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()

    # extract the page's text
    page_text = data.extractText()

    # put the text data into the dict
    all_pages[page] = page_text

    body_doc2 = {"data": page_text}
    result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
    x += 1

El código anterior carga un solo pdf en elasticsearch por página.

from flask import Flask, jsonify, request,render_template
from elasticsearch import Elasticsearch
from datetime import datetime
es = Elasticsearch("http://localhost:9200/")

app = Flask(__name__)

@app.route('/pdf', methods=['GET'])
def index():
    results = es.get(index='pdfclearn', doc_type='_doc', id='44')
    return jsonify(results['_source'])


@app.route('/pdf/<id>', methods=['GET'])
def index_by_id(id):
    results = es.get(index='pdfclearn', doc_type='_doc', id=id)
    return jsonify(results['_source'])



@app.route('/search/<keyword>', methods=['POST','GET'])
def search(keyword):
    keyword = keyword

    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    return jsonify(res['hits']['hits'])

@app.route("/searhbar")
def searhbar():
    return render_template("index.html")

@app.route("/searhbar/<string:box>")
def process(box):
    query = request.args.get('query')
    if box == 'names':
         keyword = box

    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    return jsonify(res['hits']['hits'])

app.run(port=5003, debug=True)

En el código anterior, podemos buscar en todas las páginas una palabra clave o frase.

curl http://127.0.0.1:5003/search/test //it works!!

Encontré un blog sobre cómo crear archivos PDF como índice Base64 en ElasticSearch. He visto que la API de DocuSign hace esto para crear plantillas de documentos. Sin embargo, no entiendo cómo jsonificar el PDF Base64 de una manera que se pueda buscar para ElasticSearch.

curl "http://localhost:9200/pdftext/_doc/42"

curl -X POST "http://localhost:9200/pdf/_search?q=*"

Puedo recuperar la Base64 de un documento de 700 páginas. Pero creo que lo que necesito es indexar y recuperar cada página del documento.

Blogs que he estudiado que me separaron del camino:

final del juego:

https://towardsdatascience.com/create-a-full-search-engine-via-flask-elasticsearch-javascript-d3js-and-bootstrap-275f9dc6efe1

Continuaré estudiando Elastic Search y Base64 Encoding and decoding. Pero me gustaría un poco de ayuda para lograr mi objetivo. Cualquier ejemplo detallado sería muy apreciado.

python flask search BlackFox
fuente

Encontré una Lib para Python, Whoosh: whoosh.readthedocs.io/en/latest/intro.html .

Probaré

Ahora prueba una lib para Python, Scout: scout.readthedocs.io/en/latest/installation.html

BlackFox

¿Cómo hago que se pueda buscar un PDF para una aplicación de búsqueda de matraces?

Respuestas: