diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py index 41568ae..a68acda 100644 --- a/ir_datasets/datasets/__init__.py +++ b/ir_datasets/datasets/__init__.py @@ -29,6 +29,8 @@ from . import mr_tydi from . import msmarco_document from . import msmarco_document_v2 +from . import msmarco_document_v2_1 +from . import msmarco_segment_v2_1 from . import msmarco_passage from . import msmarco_passage_v2 from . import msmarco_qna diff --git a/ir_datasets/datasets/msmarco_document_v2.py b/ir_datasets/datasets/msmarco_document_v2.py index 39b9658..d3ae586 100644 --- a/ir_datasets/datasets/msmarco_document_v2.py +++ b/ir_datasets/datasets/msmarco_document_v2.py @@ -36,17 +36,21 @@ def default_text(self): class MsMarcoV2Docs(BaseDocs): - def __init__(self, dlc): + def __init__(self, dlc, docid_prefix='msmarco_doc_', docstore_size_hint=66500029281, name=NAME): super().__init__() self._dlc = dlc + self._docid_prefix = docid_prefix + self._docstore_size_hint = docstore_size_hint + self._name = name @ir_datasets.util.use_docstore def docs_iter(self): - with self._dlc.stream() as stream, \ - tarfile.open(fileobj=stream, mode='r|') as tarf: - for record in tarf: - if not record.name.endswith('.gz'): - continue + with tarfile.open(self._dlc.path(), mode='r:') as tarf: + # since there's no compression, it's fast to scan all records and sort them. + # The sorting has no effect on v2, but in v2.1, the files are out-of-sequence, so this + # addressed that problem. + records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name) + for record in records: file = tarf.extractfile(record) with gzip.open(file) as file: for line in file: @@ -84,18 +88,17 @@ def docs_store(self, field='doc_id'): data_cls=self.docs_cls(), lookup_field=field, index_fields=['doc_id'], - key_field_prefix='msmarco_doc_', # cut down on storage by removing prefix in lookup structure - size_hint=66500029281, - count_hint=ir_datasets.util.count_hint(NAME), + key_field_prefix=self._docid_prefix, # cut down on storage by removing prefix in lookup structure + size_hint=self._docstore_size_hint, + count_hint=ir_datasets.util.count_hint(self._name), ) - # return MsMArcoV2DocStore(self) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): - return NAME + return self._name def docs_lang(self): return 'en' diff --git a/ir_datasets/datasets/msmarco_document_v2_1.py b/ir_datasets/datasets/msmarco_document_v2_1.py new file mode 100644 index 0000000..2b054c0 --- /dev/null +++ b/ir_datasets/datasets/msmarco_document_v2_1.py @@ -0,0 +1,31 @@ +import ir_datasets +from ir_datasets.util import DownloadConfig +from ir_datasets.datasets.base import Dataset, YamlDocumentation +from ir_datasets.formats import TsvQueries +from ir_datasets.datasets.msmarco_passage import DUA +from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Docs + +_logger = ir_datasets.log.easy() + +NAME = 'msmarco-document-v2.1' + +def _init(): + base_path = ir_datasets.util.home_path()/NAME + documentation = YamlDocumentation(f'docs/{NAME}.yaml') + dlc = DownloadConfig.context(NAME, base_path, dua=DUA) + # we can re-use MsMarcoV2Docs, just with a few modifications directly + collection = MsMarcoV2Docs(dlc['docs'], docid_prefix='msmarco_v2.1_doc_', docstore_size_hint=59680176084, name=NAME) + subsets = {} + + subsets['trec-rag-2024'] = Dataset( + collection, + TsvQueries(dlc['rag-2024-test-topics'], namespace=NAME, lang='en'), + ) + + ir_datasets.registry.register(NAME, Dataset(collection, documentation('_'))) + for s in sorted(subsets): + ir_datasets.registry.register(f'{NAME}/{s}', Dataset(subsets[s], documentation(s))) + + return collection, subsets + +collection, subsets = _init() diff --git a/ir_datasets/datasets/msmarco_passage_v2.py b/ir_datasets/datasets/msmarco_passage_v2.py index ea43d37..e7c3de4 100644 --- a/ir_datasets/datasets/msmarco_passage_v2.py +++ b/ir_datasets/datasets/msmarco_passage_v2.py @@ -46,11 +46,22 @@ def parse_msmarco_passage(line): data['docid']) +def passage_bundle_pos_from_key(key): + (string1, string2, bundlenum, position) = key.split('_') + assert string1 == 'msmarco' and string2 == 'passage' + return f'msmarco_passage_{bundlenum}', position + class MsMarcoV2Passages(BaseDocs): - def __init__(self, dlc, pos_dlc=None): + def __init__(self, dlc, pos_dlc=None, cls=MsMarcoV2Passage, parse_passage=parse_msmarco_passage, name=NAME, docstore_size_hint=60880127751, bundle_pos_from_key=passage_bundle_pos_from_key, count=138_364_198): super().__init__() self._dlc = dlc self._pos_dlc = pos_dlc + self._cls = cls + self._parse_passage = parse_passage + self._name = name + self._docstore_size_hint = docstore_size_hint + self._bundle_pos_from_key = bundle_pos_from_key + self._count = count @ir_datasets.util.use_docstore def docs_iter(self): @@ -59,30 +70,31 @@ def docs_iter(self): # files are used (i.e., no filtering is applied) yield from self.docs_store() else: - with self._dlc.stream() as stream, \ - tarfile.open(fileobj=stream, mode='r|') as tarf: - for record in tarf: - if not record.name.endswith('.gz'): - continue + with tarfile.open(self._dlc.path(), mode='r:') as tarf: + # since there's no compression, it's fast to scan all records and sort them. + # The sorting has no effect on v2, but in v2.1, the files are out-of-sequence, so this + # addressed that problem. + records = sorted([r for r in tarf if r.name.endswith('.gz')], key=lambda x: x.name) + for record in records: file = tarf.extractfile(record) with gzip.open(file) as file: for line in file: - yield parse_msmarco_passage(line) + yield self._parse_passage(line) def docs_cls(self): - return MsMarcoV2Passage + return self._cls def docs_store(self, field='doc_id'): assert field == 'doc_id' # Unlike for msmarco-document-v2, using the docstore actually hurts performance. - return MsMarcoV2DocStore(self) + return MsMarcoV2DocStore(self, size_hint=self._docstore_size_hint, count=self._count) def docs_count(self): if self.docs_store().built(): return self.docs_store().count() def docs_namespace(self): - return NAME + return self._name def docs_lang(self): return 'en' @@ -92,7 +104,7 @@ def docs_path(self, force=True): class MsMarcoV2DocStore(ir_datasets.indices.Docstore): - def __init__(self, docs_handler): + def __init__(self, docs_handler, size_hint=60880127751, count=138_364_198): super().__init__(docs_handler.docs_cls(), 'doc_id') self.np = ir_datasets.lazy_libs.numpy() self.docs_handler = docs_handler @@ -101,29 +113,30 @@ def __init__(self, docs_handler): self.base_path = docs_handler.docs_path(force=False) + '.extracted' if not os.path.exists(self.base_path): os.makedirs(self.base_path) - self.size_hint = 60880127751 + self.size_hint = size_hint + self._count = count def get_many_iter(self, keys): self.build() # adapted from bundles = {} for key in keys: - if not key.count('_') == 3: + try: + bundlenum, position = self.docs_handler._bundle_pos_from_key(key) + except: continue - (string1, string2, bundlenum, position) = key.split('_') - assert string1 == 'msmarco' and string2 == 'passage' if bundlenum not in bundles: bundles[bundlenum] = [] bundles[bundlenum].append(int(position)) for bundlenum, positions in bundles.items(): positions = sorted(positions) - file = f'{self.base_path}/msmarco_passage_{bundlenum}' + file = f'{self.base_path}/{bundlenum}' if not os.path.exists(file): # invalid doc_id -- doesn't point to a real bundle continue if self.docs_handler._pos_dlc is not None: # check the positions are valid for these doc_ids -- only return valid ones - mmp = self.np.memmap(os.path.join(self.pos_dlc.path(), f'msmarco_passage_{bundlenum}.pos'), dtype=' +Version 2.1 of the MS MARCO document ranking dataset used in TREC RAG 2024. +

+' + bibtex_ids: [] + diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json index 66b4404..b648ae6 100644 --- a/ir_datasets/etc/downloads.json +++ b/ir_datasets/etc/downloads.json @@ -4714,6 +4714,38 @@ } }, + "msmarco-document-v2.1": { + "docs": { + "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc.tar", + "size_hint": 30844989440, + "expected_md5": "a5950665d6448d3dbaf7135645f1e074", + "cache_path": "msmarco_v2.1_doc.tar", + "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}} + }, + "rag-2024-test-topics": { + "url": "https://trec-rag.github.io/assets/txt/topics.rag24.test.txt", + "size_hint": 19517, + "expected_md5": "5bd6c8fa0e1300233fe139bae8288d09", + "cache_path": "trec-rag-2024-topics-test.txt" + } + }, + + "msmarco-segment-v2.1": { + "docs": { + "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco_v2.1_doc_segmented.tar", + "size_hint": 26918768640, + "expected_md5": "3799e7611efffd8daeb257e9ccca4d60", + "cache_path": "msmarco_v2.1_doc_segmented.tar", + "download_args": {"headers": {"X-Ms-Version": "2024-07-10"}} + }, + "rag-2024-test-topics": { + "url": "https://trec-rag.github.io/assets/txt/topics.rag24.test.txt", + "size_hint": 19517, + "expected_md5": "5bd6c8fa0e1300233fe139bae8288d09", + "cache_path": "trec-rag-2024-topics-test.txt" + } + }, + "msmarco-passage": { "collectionandqueries": { "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/collectionandqueries.tar.gz", diff --git a/test/integration/msmarco_document_v2_1.py b/test/integration/msmarco_document_v2_1.py new file mode 100644 index 0000000..b2680c8 --- /dev/null +++ b/test/integration/msmarco_document_v2_1.py @@ -0,0 +1,29 @@ +import re +import unittest +import ir_datasets +from ir_datasets.datasets.msmarco_document_v2 import MsMarcoV2Document +from ir_datasets.formats import TrecQrel, GenericQuery +from .base import DatasetIntegrationTest + + +_logger = ir_datasets.log.easy() + + +class TestMSMarcoV21Docs(DatasetIntegrationTest): + def test_docs(self): + self._test_docs('msmarco-document-v2.1', count=10960555, items={ + 0: MsMarcoV2Document('msmarco_v2.1_doc_00_0', 'http://0-60.reviews/0-60-times/', '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews', '0-60 Times\n0-60 Times', re.compile('^0\\-60 Times \\- 0\\-60 \\| 0 to 60 Times \\& 1/4 Mile Times \\| Zero to 60 Car Reviews\n0\\-60 Times\nThere are man.{4332} biggest touted numbers for vehicles, and easier for people to relate to than horsepower and torque\\.$', flags=48)), + 9: MsMarcoV2Document('msmarco_v2.1_doc_00_110582', 'http://003.clayton.k12.ga.us/', 'Home - Morrow High School', 'Morrow High\nMorrow High', re.compile("^Home \\- Morrow High School\nMore Options\nSelect a School\nDISTRICT\nCCPS\nElementary\nAnderson Elementary\n.{4959}oks Site\nMs\\. Cavazos' Site\nMr\\. Holbrook's Site\nMs\\. Hunt's Site\nMs\\. Lamarre's Site\nMr\\. McClain's Site$", flags=48)), + 10960554: MsMarcoV2Document('msmarco_v2.1_doc_59_964287870', 'https://zzzzbov.com/blag/shortcut-to-zoom', 'Shortcut to Zoom › zzzzBov.com', 'Shortcut to Zoom\nShortcut to Zoom\nBatch File\nShortcut\nTrying it out\n', re.compile('^Shortcut to Zoom › zzzzBov\\.com\n07 \\- Apr \\- 2020\nShortcut to Zoom\nI use Chrome on Windows as my primar.{2297}hat adding even a few of these to my start menu will help reduce just a bit more friction in my day\\.$', flags=48)), + }) + + def test_queries(self): + self._test_queries('msmarco-document-v2.1/trec-rag-2024', count=301, items={ + 0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'), + 9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'), + 300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'), + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/integration/msmarco_segment_v2_1.py b/test/integration/msmarco_segment_v2_1.py new file mode 100644 index 0000000..2fe48bf --- /dev/null +++ b/test/integration/msmarco_segment_v2_1.py @@ -0,0 +1,29 @@ +import re +import unittest +import ir_datasets +from ir_datasets.datasets.msmarco_segment_v2_1 import MsMarcoV21SegmentedDoc +from ir_datasets.formats import TrecQrel, GenericQuery +from .base import DatasetIntegrationTest + + +_logger = ir_datasets.log.easy() + + +class TestMSMarcoV21DocsSegmented(DatasetIntegrationTest): + def test_docs(self): + self._test_docs('msmarco-segment-v2.1', count=113520750, items={ + 0: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_00_0#0_0', 'http://0-60.reviews/0-60-times/', '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews', '0-60 Times\n0-60 Times', re.compile('^0\\-60 Times \\- 0\\-60 \\| 0 to 60 Times \\& 1/4 Mile Times \\| Zero to 60 Car Reviews\n0\\-60 Times\nThere are man.{1078}used as the standard in the United States, where the rest of the world prefers the 0\\-100 km version\\.$', flags=48), 0, 1278, 'msmarco_v2.1_doc_00_0', 0), + 9: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_00_4810#2_16701', 'http://0-www.worldcat.org.novacat.nova.edu/identities/lccn-n79036869/', 'Ethel Percy Andrus Gerontology Center [WorldCat Identities]', re.compile('^Ethel Percy Andrus Gerontology Center\nEthel Percy Andrus Gerontology Center\nAndrus \\(Ethel Percy\\) Ger.{409}niversity of Southern California Los Angeles, Calif Ethel Percy Andrus Gerontology Center\nLanguages\n$', flags=48), re.compile('^submitted to U\\.S\\. Department of Health, Education, and Welfare, Public Health Service, Health Resea.{2311}e questionnaires used and the data derived from them, and how the data were collected and analyzed\\.$', flags=48), 2265, 4776, 'msmarco_v2.1_doc_00_4810', 2), + 113520749: MsMarcoV21SegmentedDoc('msmarco_v2.1_doc_59_964287870#4_2159633396', 'https://zzzzbov.com/blag/shortcut-to-zoom', 'Shortcut to Zoom › zzzzBov.com', 'Shortcut to Zoom\nShortcut to Zoom\nBatch File\nShortcut\nTrying it out\n', re.compile('^When it asks "What would you like to name the shortcut\\?", type the name of the meeting \\(i\\.e\\. "Standu.{333}hat adding even a few of these to my start menu will help reduce just a bit more friction in my day\\.$', flags=48), 1963, 2497, 'msmarco_v2.1_doc_59_964287870', 4), + }) + + def test_queries(self): + self._test_queries('msmarco-segment-v2.1/trec-rag-2024', count=301, items={ + 0: GenericQuery('2024-145979', 'what is vicarious trauma and how can it be coped with?'), + 9: GenericQuery('2024-158743', 'what was happening in germany and netherlands in the 1840s'), + 300: GenericQuery('2024-21669', 'do abortions kill more black people than other weapons'), + }) + + +if __name__ == '__main__': + unittest.main()