diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
index 41568ae..a68acda 100644
--- a/ir_datasets/datasets/__init__.py
+++ b/ir_datasets/datasets/__init__.py
@@ -29,6 +29,8 @@
from . import mr_tydi
from . import msmarco_document
from . import msmarco_document_v2
+from . import msmarco_document_v2_1
+from . import msmarco_segment_v2_1
from . import msmarco_passage
from . import msmarco_passage_v2
from . import msmarco_qna
diff --git a/ir_datasets/datasets/msmarco_document_v2.py b/ir_datasets/datasets/msmarco_document_v2.py
index 39b9658..d3ae586 100644
--- a/ir_datasets/datasets/msmarco_document_v2.py
+++ b/ir_datasets/datasets/msmarco_document_v2.py
@@ -36,17 +36,21 @@ def default_text(self):
class MsMarcoV2Docs(BaseDocs):
- def __init__(self, dlc):
+ def __init__(self, dlc, docid_prefix='msmarco_doc_', docstore_size_hint=66500029281, name=NAME):
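+        # the new keyword arguments let this class also back the v2.1 corpus,
+        # which uses a different ID prefix, docstore size hint, and dataset name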
super().__init__()
self._dlc = dlc
+ self._docid_prefix = docid_prefix
+ self._docstore_size_hint = docstore_size_hint
+ self._name = name
@ir_datasets.util.use_docstore
def docs_iter(self):
- with self._dlc.stream() as stream, \
- tarfile.open(fileobj=stream, mode='r|') as tarf:
- for record in tarf:
- if not record.name.endswith('.gz'):
- continue
+ with tarfile.open(self._dlc.path(), mode='r:') as tarf:
+            # since there's no compression, it's fast to scan all records and sort them.
+            # Sorting has no effect on v2, but in v2.1 the files are out of sequence, so
+            # this addresses that problem.
+ records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name)
+ for record in records:
file = tarf.extractfile(record)
with gzip.open(file) as file:
for line in file:
@@ -84,18 +88,17 @@ def docs_store(self, field='doc_id'):
data_cls=self.docs_cls(),
lookup_field=field,
index_fields=['doc_id'],
- key_field_prefix='msmarco_doc_', # cut down on storage by removing prefix in lookup structure
- size_hint=66500029281,
- count_hint=ir_datasets.util.count_hint(NAME),
+ key_field_prefix=self._docid_prefix, # cut down on storage by removing prefix in lookup structure
+ size_hint=self._docstore_size_hint,
+ count_hint=ir_datasets.util.count_hint(self._name),
)
-        # return MsMarcoV2DocStore(self)
def docs_count(self):
if self.docs_store().built():
return self.docs_store().count()
def docs_namespace(self):
- return NAME
+ return self._name
def docs_lang(self):
return 'en'
diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py
new file mode 100644
index 0000000..2b054c0
--- /dev/null
+++ b/ir_datasets/datasets/msmarco_document_v2_1.py
@@ -0,0 +1,31 @@
+import ir_datasets
+from ir_datasets.util import DownloadConfig
+from ir_datasets.datasets.base import Dataset, YamlDocumentation
+from ir_datasets.formats import TsvQueries
+from ir_datasets.datasets.msmarco_passage import DUA
+from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs
+
+_logger = ir_datasets.log.easy()
+
+NAME = 'msmarco-document-v2.1'
+
+def _init():
+ base_path = ir_datasets.util.home_path()/NAME
+ documentation = YamlDocumentation(f'docs/{NAME}.yaml')
+ dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
+    # we can re-use MsMarcoV2Docs directly, just overriding a few parameters
+ collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=59680176084, name=NAME)
+ subsets = {}
+
+ subsets['trec-rag-2024'] = Dataset(
+ collection,
+ TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'),
+ )
+
+ ir_datasets.registry.register(NAME, Dataset(collection, documentation('_')))
+ for s in sorted(subsets):
+ ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s)))
+
+ return collection, subsets
+
+collection, subsets = _init()
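+
+# A minimal usage sketch (the doc_id below is hypothetical):
+#
+#     import ir_datasets
+#     dataset = ir_datasets.load('msmarco-document-v2.1/trec-rag-2024')
+#     for query in dataset.queries_iter():
+#         ...  # GenericQuery(query_id, text)
+#     doc = dataset.docs_store().get('msmarco_v2.1_doc_00_0')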
diff --git a/ir_datasets/datasets/msmarco_passage_v2.py b/ir_datasets/datasets/msmarco_passage_v2.py
index ea43d37..e7c3de4 100644
--- a/ir_datasets/datasets/msmarco_passage_v2.py
+++ b/ir_datasets/datasets/msmarco_passage_v2.py
@@ -46,11 +46,22 @@ def parse_msmarco_passage(line):
data['docid'])
+def passage_bundle_pos_from_key(key):
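+    # splits a passage ID into its bundle and position, e.g. (hypothetical ID)
+    # 'msmarco_passage_00_0' -> ('msmarco_passage_00', '0')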
+ (string1, string2, bundlenum, position) = key.split('_')
+ assert string1 == 'msmarco' and string2 == 'passage'
+ return f'msmarco_passage_{bundlenum}', position
+
class MsMarcoV2Passages(BaseDocs):
- def __init__(self, dlc, pos_dlc=None):
+ def __init__(self, dlc, pos_dlc=None, cls=MsMarcoV2Passage, parse_passage=parse_msmarco_passage, name=NAME, docstore_size_hint=60880127751, bundle_pos_from_key=passage_bundle_pos_from_key, count=138_364_198):
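+        # as with MsMarcoV2Docs, the extra keyword arguments let this class be
+        # re-used for v2.1 (different record class, parser, ID scheme, and hints)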
super().__init__()
self._dlc = dlc
self._pos_dlc = pos_dlc
+ self._cls = cls
+ self._parse_passage = parse_passage
+ self._name = name
+ self._docstore_size_hint = docstore_size_hint
+ self._bundle_pos_from_key = bundle_pos_from_key
+ self._count = count
@ir_datasets.util.use_docstore
def docs_iter(self):
@@ -59,30 +70,31 @@ def docs_iter(self):
# files are used (i.e., no filtering is applied)
yield from self.docs_store()
else:
- with self._dlc.stream() as stream, \
- tarfile.open(fileobj=stream, mode='r|') as tarf:
- for record in tarf:
- if not record.name.endswith('.gz'):
- continue
+ with tarfile.open(self._dlc.path(), mode='r:') as tarf:
+                # since there's no compression, it's fast to scan all records and sort them.
+                # Sorting has no effect on v2, but in v2.1 the files are out of sequence, so
+                # this addresses that problem.
+ records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name)
+ for record in records:
file = tarf.extractfile(record)
with gzip.open(file) as file:
for line in file:
- yield parse_msmarco_passage(line)
+ yield self._parse_passage(line)
def docs_cls(self):
- return MsMarcoV2Passage
+ return self._cls
def docs_store(self, field='doc_id'):
assert field == 'doc_id'
# Unlike for msmarco-document-v2, using the docstore actually hurts performance.
-        return MsMarcoV2DocStore(self)
+ return MsMarcoV2DocStore(self, size_hint=self._docstore_size_hint, count=self._count)
def docs_count(self):
if self.docs_store().built():
return self.docs_store().count()
def docs_namespace(self):
- return NAME
+ return self._name
def docs_lang(self):
return 'en'
@@ -92,7 +104,7 @@ def docs_path(self, force=True):
class MsMarcoV2DocStore(ir_datasets.indices.Docstore):
- def __init__(self, docs_handler):
+ def __init__(self, docs_handler, size_hint=60880127751, count=138_364_198):
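+        # defaults match the v2 passage corpus; other corpora (e.g., v2.1)
+        # pass their own size_hint and count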
super().__init__(docs_handler.docs_cls(), 'doc_id')
self.np = ir_datasets.lazy_libs.numpy()
self.docs_handler = docs_handler
@@ -101,29 +113,30 @@ def __init__(self, docs_handler):
self.base_path = docs_handler.docs_path(force=False) + '.extracted'
if not os.path.exists(self.base_path):
os.makedirs(self.base_path)
- self.size_hint = 60880127751
+ self.size_hint = size_hint
+ self._count = count
def get_many_iter(self, keys):
self.build()
# adapted from