"""Gradio demo: BM25 search over the Lex Fridman podcast transcript dataset."""

import logging

import datasets
from findkit import indexes
import gradio as gr

logging.basicConfig(level="INFO")


def get_html_retrieval_results(retrieval_result, show_only_one_match_per_episode):
    """Render a retrieval result frame as an HTML table.

    Args:
        retrieval_result: DataFrame of retrieved rows; must contain an
            ``episode`` column when deduplication is requested.
        show_only_one_match_per_episode: when true, keep only the first row
            for each distinct ``episode`` value.

    Returns:
        The HTML table as a string, or ``""`` when there are no rows.
    """
    if show_only_one_match_per_episode:
        retrieval_result = retrieval_result.drop_duplicates(subset=["episode"])
    if len(retrieval_result) == 0:
        return ""
    # render_links turns URL-valued cells into clickable anchors.
    return retrieval_result.to_html(render_links=True, index=False)


def get_retrieval_results(findkit_index, query, n_retrieved_results):
    """Query the index and return the matches with a descriptive score column.

    Args:
        findkit_index: index object exposing ``find_similar(query, n)``.
        query: free-text search query.
        n_retrieved_results: number of results to request from the index.

    Returns:
        The DataFrame returned by the index, with the ``distance`` column
        renamed to ``bm25_score``.
    """
    retrieval_results_df = findkit_index.find_similar(query, n_retrieved_results)
    # ``columns=`` is required: a bare positional mapping is applied to the
    # row index by pandas, which would leave the ``distance`` column untouched.
    return retrieval_results_df.rename(columns={"distance": "bm25_score"})


def setup_df():
    """Load the podcast dataset's train split as a DataFrame without NaN rows."""
    podcast_dataset = datasets.load_dataset("lambdaofgod/lex_fridman_podcast")["train"]
    df = podcast_dataset.to_pandas()
    return df.dropna()


def setup_index():
    """Build an in-memory BM25 index over the ``text`` column of the dataset."""
    df = setup_df()
    return indexes.InMemoryBM25Index.build(df["text"], df)


# Built once at import time so every request reuses the same index.
findkit_index = setup_index()


def show_retrieval_results(query, n_retrieved_results, show_only_one_match_per_episode):
    """Gradio callback: search the module-level index and return HTML.

    Args:
        query: free-text search query.
        n_retrieved_results: number of results to request from the index.
        show_only_one_match_per_episode: when true, show at most one row per
            episode.

    Returns:
        HTML table of the matches, or ``""`` when nothing was retrieved.
    """
    retrieval_results_df = get_retrieval_results(
        findkit_index, query, n_retrieved_results
    )
    return get_html_retrieval_results(
        retrieval_results_df, show_only_one_match_per_episode
    )


show_only_one_match_per_episode = gr.Checkbox(
    label="show only one match per episode", value=False
)
# precision=0 makes the Number component deliver an integer.
n_retrieved_results = gr.Number(label="number of results", value=10, precision=0)
query = gr.Textbox(label="input query", value="artificial life")

demo = gr.Interface(
    fn=show_retrieval_results,
    inputs=[query, n_retrieved_results, show_only_one_match_per_episode],
    outputs="html",
)

demo.launch()