INFS7410 Information Retrieval and Web Search - Project Part 1 - Web Passage Ranking
In [ ]:
# Text-analysis settings; both are forwarded to get_lucene_analyzer() in the
# setup cell below as its `stemming` and `stopwords` keyword arguments.
stemming = None # None or 'porter' or anything else
stopwords = False # False or True
# Path of the Lucene index opened by SimpleSearcher and IndexReader below.
# NOTE(review): the 'xxxx' suffix looks like a placeholder — confirm the real
# directory name before running.
index = 'indexes/lucene-index-msmarco-passage-xxxx/'
In [ ]:
from pyserini.search import SimpleSearcher
from pyserini.analysis import Analyzer, get_lucene_analyzer
from pyserini.index import IndexReader
from tqdm import tqdm
# Build the Lucene analyzer from the stemming/stopwords settings chosen above.
lucene_analyzer = get_lucene_analyzer(stemming=stemming, stopwords=stopwords)
# Python-side wrapper around the same analyzer; search() uses it to turn a
# query string into the list of analyzed terms.
analyzer = Analyzer(lucene_analyzer)
# Searcher over the index, configured with the same analyzer.
# NOTE(review): `searcher` is not referenced by the code visible in this
# section — presumably kept for later cells; confirm.
searcher = SimpleSearcher(index)
searcher.set_analyzer(lucene_analyzer)
# Reader used below to enumerate index terms, convert docids and fetch
# postings lists.
index_reader = IndexReader(index)
# Pre-compute a term -> document-frequency lookup for every term in the index
# so that scoring does not need to query the index for it later.
# Expect this to take around 2 minutes.
df_dict = {
    entry.term: entry.df
    for entry in tqdm(index_reader.terms(), desc="loading idf dictionary:")
}
# Cache, for every passage in the collection:
#   doc_len_dict: collection docid -> passage length in whitespace-separated tokens
#   doc_id_dict:  pyserini internal docid -> collection docid
# This takes around 2 minutes.
doc_len_dict = {}
doc_id_dict = {}
# The encoding is given explicitly so the read does not depend on the
# platform's default locale (assumes collection.tsv is UTF-8 — confirm if a
# different collection file is used).
with open('collection/collection.tsv', 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of readlines(): the lines are
    # only needed once, so there is no reason to hold the whole collection in
    # memory. The progress bar therefore shows a running count, not a total.
    for line in tqdm(f, desc="loading doc_length dictionary:"):
        # Split on the first tab only, so a tab inside the passage text cannot
        # break the two-way unpacking; text.split() below treats any such tab
        # as ordinary whitespace.
        docid, text = line.split('\t', 1)
        doc_len_dict[docid] = len(text.split())
        internal_id = index_reader.convert_collection_docid_to_internal_docid(docid)
        doc_id_dict[internal_id] = docid
In [ ]:
def search(query: str, k: int=1000, scorer=None):
    """
    Score every document that contains at least one query term and return
    the k highest-scoring ones.

    Each posting of each analyzed query term contributes
    scorer(tf, df, doc_len) to its document; contributions for the same
    document are summed across terms.

    Inputs:
        query (str): the query string to perform the search.
        k (int): the number of documents to be returned.
        scorer: your implemented scoring function, such as bm25. It is
            called as scorer(tf, df, doc_len).
    Output:
        results (list): the sorted result list, a list of tuples.
            The first element in the tuples is the docid,
            the second is the doc score.
    Raises:
        AssertionError: if no scorer is supplied.
    """
    # Kept as an assert so the failure type seen by existing callers is unchanged.
    assert scorer is not None, "search() requires a scoring function, e.g. scorer=bm25"
    print("-----------------------------------------------------")
    print("Current query:", query)

    # get the analyzed term list
    q_terms = analyzer.analyze(query)
    doc_scores = {}
    for term in q_terms:
        # The term has already been analyzed above, so analyzer=None is passed
        # (presumably to stop the index reader from analyzing it a second time).
        postings_list = index_reader.get_postings_list(term, analyzer=None)
        if postings_list is None:
            # No postings were returned for this term; it contributes nothing.
            continue

        # Document frequency is constant for the term, so look it up once.
        df = df_dict[term]
        for posting in tqdm(postings_list, desc=f"Iterate posting for term '{term}'"):
            # convert pyserini internal docid to the actual docid
            docid = doc_id_dict[posting.docid]
            score = scorer(posting.tf, df, doc_len_dict[docid])
            # Accumulate the per-term contributions for each document.
            doc_scores[docid] = doc_scores.get(docid, 0) + score

    # Sort by score, best first, and keep the top k (docid, score) tuples.
    results = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:k]
    # Closing separator; this used to sit after the return and never ran.
    print("-----------------------------------------------------")
    return results
In [ ]:
# Import all your python libraries and put setup code here.
In [ ]:
# Put your implementation of BM25 here, including parameter tuning.
In [ ]:
# Put your implementations for the gain-loss plots here.