diff --git a/Changelog.rst b/Changelog.rst index cf2c6e3492..70dd328c88 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,6 +3,10 @@ Version NEXTVERSION **2026-??-??** +* Read Kerchunk datasets with `cf.read` + (https://github.com/NCAS-CMS/cf-python/issues/936) +* Read open file handle datasets with `cf.read` + (https://github.com/NCAS-CMS/cf-python/issues/937) * Support for HEALPix grids (https://github.com/NCAS-CMS/cf-python/issues/909) * New HEALPix methods: `cf.Field.healpix_info`, diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 614f407f2f..22732550cf 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -20,8 +20,8 @@ class read(cfdm.read): """Read field or domain constructs from files. - The following file formats are supported: netCDF, CDL, Zarr, PP, - and UM fields file. + The following file formats are supported: netCDF, CDL, Zarr, + Kerchunk, PP, and UM fields file. NetCDF and Zarr datasets may be on local disk, on an OPeNDAP server, or in an S3 object store. 
@@ -144,7 +144,7 @@ class read(cfdm.read): :Parameters: - {{read datasets: (arbitrarily nested sequence of) `str`}} + {{read datasets:}} {{read recursive: `bool`, optional}} @@ -162,6 +162,7 @@ class read(cfdm.read): ``'netCDF'`` A netCDF-3 or netCDF-4 dataset ``'CDL'`` A text CDL file of a netCDF dataset ``'Zarr'`` A Zarr v2 (xarray) or Zarr v3 dataset + ``'Kerchunk'`` A Kerchunked dataset ``'UM'`` A UM fields file or PP dataset ============== ========================================== diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 049b538e9b..4a109abced 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -3549,6 +3549,13 @@ def read( "(only Field constructs)" ) + representation = self.dataset_representation(dataset) + if representation != "path": + raise NotImplementedError( + "Can't yet read Field constructs from a UM or PP " + f"{representation!r} dataset: {dataset!r}" + ) + if not _stash2standard_name: # -------------------------------------------------------- # Create the STASH code to standard_name conversion @@ -3835,6 +3842,41 @@ def dataset_open(self, filename, parse=True): parse=parse, ) + @classmethod + def dataset_representation(cls, dataset): + """Return the logical representation type of the input dataset. + + .. versionadded:: NEXTVERSION + + :Parameters: + + dataset: + The dataset. May be a string-valued path or a + file-like object. + + :Returns: + + `str` + The dataset representation: + + * ``'path'``: A string-valued path. + + * ``'file_handle'``: An open file handle (such as + returned by `fsspec.filesystem.open`) + + * ``'unknown'``: Anything else. 
+ + """ + # Strings (Paths) + if isinstance(dataset, str): + return "path" + + # Check for a "binary stream" (file handle) + if hasattr(dataset, "read") and hasattr(dataset, "seek"): + return "file_handle" + + return "unknown" + """ Problems: diff --git a/cf/test/create_test_files.py b/cf/test/create_test_files.py index e2b6cf6f48..604f38c787 100644 --- a/cf/test/create_test_files.py +++ b/cf/test/create_test_files.py @@ -2228,6 +2228,150 @@ def _make_ugrid_2(filename): return filename +def _make_ugrid_3(filename): + """Create a UGRID mesh topology and no fields/domains.""" + n = netCDF4.Dataset(filename, "w") + + n.Conventions = f"CF-{VN}" + + n.createDimension("nMesh3_node", 7) + n.createDimension("nMesh3_edge", 9) + n.createDimension("nMesh3_face", 3) + n.createDimension("connectivity2", 2) + n.createDimension("connectivity4", 4) + n.createDimension("connectivity5", 5) + + Mesh3 = n.createVariable("Mesh3", "i4", ()) + Mesh3.cf_role = "mesh_topology" + Mesh3.topology_dimension = 2 + Mesh3.node_coordinates = "Mesh3_node_x Mesh3_node_y" + Mesh3.face_node_connectivity = "Mesh3_face_nodes" + Mesh3.edge_node_connectivity = "Mesh3_edge_nodes" + Mesh3.face_dimension = "nMesh3_face" + Mesh3.edge_dimension = "nMesh3_edge" + Mesh3.face_face_connectivity = "Mesh3_face_links" + Mesh3.edge_edge_connectivity = "Mesh3_edge_links" + + # Node + Mesh3_node_x = n.createVariable("Mesh3_node_x", "f4", ("nMesh3_node",)) + Mesh3_node_x.standard_name = "longitude" + Mesh3_node_x.units = "degrees_east" + Mesh3_node_x[...] = [-45, -43, -45, -43, -45, -43, -40] + + Mesh3_node_y = n.createVariable("Mesh3_node_y", "f4", ("nMesh3_node",)) + Mesh3_node_y.standard_name = "latitude" + Mesh3_node_y.units = "degrees_north" + Mesh3_node_y[...] = [35, 35, 33, 33, 31, 31, 34] + + Mesh3_edge_nodes = n.createVariable( + "Mesh3_edge_nodes", "i4", ("nMesh3_edge", "connectivity2") + ) + Mesh3_edge_nodes.long_name = "Maps every edge to its two nodes" + Mesh3_edge_nodes[...] 
= [ + [1, 6], + [3, 6], + [3, 1], + [0, 1], + [2, 0], + [2, 3], + [2, 4], + [5, 4], + [3, 5], + ] + + # Face + Mesh3_face_x = n.createVariable( + "Mesh3_face_x", "f8", ("nMesh3_face",), fill_value=-99 + ) + Mesh3_face_x.standard_name = "longitude" + Mesh3_face_x.units = "degrees_east" + Mesh3_face_x[...] = [-44, -44, -42] + + Mesh3_face_y = n.createVariable( + "Mesh3_face_y", "f8", ("nMesh3_face",), fill_value=-99 + ) + Mesh3_face_y.standard_name = "latitude" + Mesh3_face_y.units = "degrees_north" + Mesh3_face_y[...] = [34, 32, 34] + + Mesh3_face_nodes = n.createVariable( + "Mesh3_face_nodes", + "i4", + ("nMesh3_face", "connectivity4"), + fill_value=-99, + ) + Mesh3_face_nodes.long_name = "Maps every face to its corner nodes" + Mesh3_face_nodes[...] = [[2, 3, 1, 0], [4, 5, 3, 2], [6, 1, 3, -99]] + + Mesh3_face_links = n.createVariable( + "Mesh3_face_links", + "i4", + ("nMesh3_face", "connectivity4"), + fill_value=-99, + ) + Mesh3_face_links.long_name = "neighbour faces for faces" + Mesh3_face_links[...] = [ + [1, 2, -99, -99], + [0, -99, -99, -99], + [0, -99, -99, -99], + ] + + # Edge + Mesh3_edge_x = n.createVariable( + "Mesh3_edge_x", "f8", ("nMesh3_edge",), fill_value=-99 + ) + Mesh3_edge_x.standard_name = "longitude" + Mesh3_edge_x.units = "degrees_east" + Mesh3_edge_x[...] = [-41.5, -41.5, -43, -44, -45, -44, -45, -44, -43] + + Mesh3_edge_y = n.createVariable( + "Mesh3_edge_y", "f8", ("nMesh3_edge",), fill_value=-99 + ) + Mesh3_edge_y.standard_name = "latitude" + Mesh3_edge_y.units = "degrees_north" + Mesh3_edge_y[...] = [34.5, 33.5, 34, 35, 34, 33, 32, 31, 32] + + Mesh3_edge_links = n.createVariable( + "Mesh3_edge_links", + "i4", + ("nMesh3_edge", "connectivity5"), + fill_value=-99, + ) + Mesh3_edge_links.long_name = "neighbour edges for edges" + Mesh3_edge_links[...] 
= [ + [1, 2, 3, -99, -99], + [0, 2, 5, 8, -99], + [3, 0, 1, 5, 8], + [4, 2, 0, -99, -99], + [ + 3, + 5, + 6, + -99, + -99, + ], + [4, 6, 2, 1, 8], + [ + 4, + 5, + 7, + -99, + -99, + ], + [ + 6, + 8, + -99, + -99, + -99, + ], + [7, 5, 2, 1, -99], + ] + + n.close() + return filename + + def _make_aggregation_value(filename): """Create an aggregation variable with 'unique_values'.""" n = netCDF4.Dataset(filename, "w") @@ -2341,6 +2485,7 @@ def _make_aggregation_value(filename): ugrid_1 = _make_ugrid_1("ugrid_1.nc") ugrid_2 = _make_ugrid_2("ugrid_2.nc") +ugrid_3 = _make_ugrid_3("ugrid_3.nc") aggregation_value = _make_aggregation_value("aggregation_value.nc") diff --git a/cf/test/example_field_0.kerchunk b/cf/test/example_field_0.kerchunk new file mode 100644 index 0000000000..4f8678854c --- /dev/null +++ b/cf/test/example_field_0.kerchunk @@ -0,0 +1 @@ +{"version":1,"refs":{"lat\/0":"base64:eF5jYMABDjQwNBwIcmNwCzpwgMHBAQAxqAWx","lat_bnds\/0.0":"base64:eF5jYCAVNMAZYX5+dkDg5xd2AAQcQAAAZ+II3Q==","lon\/0":"base64:eF5jYCAONDxQ2BDwwUPCLCAmOTOvqNQBCgB9ngjU","lon_bnds\/0.0":"base64:eF5jYKAyaACCBw8aGhQUHjzYsKGBwc0tLCwhIS0tJ6egoLi4jMEBFQAA\/noSOQ==","q\/0.0":["example_field_0.nc",17755,100],"q\/0.1":"base64:eF6z7v4lGfX4l4\/QxJO77ufUsaQsrTNvflvm6f88K8x6dlbngZp9PL2\/9\/DdP7LH8ZT9fKbpK0t0\/RVKEhaaSmnnfZy8qXfD5KeucfrtkxbNmLt8U8WGbbvWttkjAQAjCS4Q","q\/1.0":"base64:eF6TzNB+88v69hsGIGCZ2B5Yd\/9nIIgd9lZsR5Z\/3Q4Qm6\/mptye3iw5EFvX\/lxryfQ9rSD2JtPvryfnlbwGseeuW7yxYtHkhfOAwB4K3IEAAKyvI6g=","q\/1.1":"base64:eF6TtLaeY909x5IBCFju3z9yP+fIFhA7zN9fwf+5wgkQm6+3d0Pv7w1lILbu9OlF01cW7QOxN+XlZeR9zJgPYq\/d1DRx0qKFXfOAwB4K3IEAAGAXIr4=","time\/0":"\u0000\u0000\u0000\u0000\u0000\u0000?@",".zgroup":"{\"zarr_format\":2}",".zattrs":"{\"Conventions\":\"CF-1.12\"}","lat\/.zarray":"{\"shape\":[5],\"chunks\":[5],\"dtype\":\" + # cf.log_level('DISABLE') + + def test_kerchunk_read(self): + """Test cf.read with Kerchunk.""" + f = cf.read(self.netcdf)[0] + + k = cf.read(self.kerchunk, dask_chunks=3) + 
self.assertEqual(len(k), 1) + self.assertTrue(k[0].equals(f)) + self.assertGreater(k[0].data.npartitions, 1) + + k = cf.read([self.kerchunk, self.kerchunk], dask_chunks=3) + self.assertEqual(len(k), 2) + self.assertTrue(k[0].equals(k[-1])) + + k = cf.read([self.kerchunk, self.kerchunk, self.netcdf], dask_chunks=3) + self.assertEqual(len(k), 3) + self.assertTrue(k[0].equals(k[-1])) + self.assertTrue(k[1].equals(k[-1])) + + def test_kerchunk_original_filenames(self): + """Test original_filenames with Kerchunk.""" + k = cf.read(self.kerchunk)[0] + self.assertEqual(k.get_original_filenames(), set()) + + def test_read_dict(self): + """Test cf.read with a Kerchunk dictionary.""" + with open(kerchunk_file, "r") as fh: + d = json.load(fh) + + with self.assertRaises(ValueError): + cf.read(d) + + fs = fsspec.filesystem("reference", fo=d) + kerchunk = fs.get_mapper() + self.assertEqual(len(cf.read(kerchunk)), 1) + + def test_read_bytes(self): + """Test cf.read with Kerchunk bytes.""" + with open(kerchunk_file, "r") as fh: + d = json.load(fh) + + b = json.dumps(d).encode("utf-8") + with self.assertRaises(ValueError): + cf.read(b) + + d = json.loads(b) + fs = fsspec.filesystem("reference", fo=d) + kerchunk = fs.get_mapper() + self.assertEqual(len(cf.read(kerchunk)), 1) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print("") + unittest.main(verbosity=2) diff --git a/cf/test/test_pp.py b/cf/test/test_pp.py index 08a85a4eef..d5812deca8 100644 --- a/cf/test/test_pp.py +++ b/cf/test/test_pp.py @@ -145,6 +145,15 @@ def test_PP_um_version(self): f = cf.read(self.ppfile, um={"version": "6.6.3"})[0] self.assertEqual(f.get_property("um_version"), "6.6.3") + def test_PP_file_object(self): + # Can't yet read PP/UM from file-like objects + with open(self.ppfile, "rb") as fh: + with self.assertRaises(NotImplementedError): + cf.read(fh) + + # Check that the file has been rewound + self.assertEqual(fh.tell(), 0) + if __name__ == 
"__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 2f80c858e6..1f3b60a122 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -347,9 +347,8 @@ def test_write_netcdf_mode(self): if fmt == "NETCDF4_CLASSIC" and ex_field_n in (6, 7): continue - print( - "TODOUGRID: excluding example fields 8, 9, 10 until writing UGRID is enabled" - ) + # Exclude UGRID fields, as we deal with them in + # test_UGRID.py if ex_field_n in (8, 9, 10): continue @@ -427,9 +426,9 @@ def test_write_netcdf_mode(self): # Now do the same test, but appending all of the example fields in # one operation rather than one at a time, to check that it works. cf.write(g, tmpfile, fmt=fmt, mode="w") # 1. overwrite to wipe - print( - "TODOUGRID: excluding example fields 8, 9, 10 until writing UGRID is enabled" - ) + + # Exclude UGRID fields, as we deal with them in + # test_UGRID.py append_ex_fields = cf.example_fields(0, 1, 2, 3, 4, 5, 6, 7) del append_ex_fields[1] # note: can remove after Issue #141 closed if fmt in "NETCDF4_CLASSIC": diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 976c717123..e7f4eafa0f 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -227,6 +227,9 @@ Required * `scipy `_, version 1.10.0 or newer. +* `fsspec `_, version 2026.2.0 or + newer. + * `cfdm `_, version 1.13.1.0 or up to, but not including, 1.13.2.0. diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index d1bbfc4dd7..baa4fc62ae 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -67,8 +67,8 @@ may nonetheless be modified in memory. 
The `cf` package can: * read :term:`field constructs ` and :term:`domain - constructs ` from netCDF, CDL, Zarr, PP and UM - datasets with a choice of netCDF backends, + constructs ` from netCDF, CDL, Zarr, Kerchunk, PP + and UM datasets with a choice of netCDF backends, * read files from OPeNDAP servers and S3 object stores, diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index a696524e8f..6e9ff6697e 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -139,15 +139,15 @@ The following file types can be read: .. -* `CFA-netCDF - `_ - files at version 0.6 or later. +* Datasets in `Kerchunk `_ format. .. * :ref:`PP and UM fields files `, whose contents are mapped into field constructs. +.. + Note that when reading netCDF4 files that contain :ref:`hierachical groups `, the group structure is saved via the :ref:`netCDF interface ` so that it may be re-used, diff --git a/setup.py b/setup.py index 5d388cb0f9..0c4d0ba6c5 100755 --- a/setup.py +++ b/setup.py @@ -178,7 +178,7 @@ def compile(): The ``cf`` package can: -* read field and domain constructs from netCDF, CDL, Zarr, PP and UM datasets, +* read field and domain constructs from netCDF, CDL, Zarr, Kerchunk, PP and UM datasets, * be fully flexible with respect to dataset storage chunking,