Source code for tailestim.datasets

import logging
import os

import numpy as np


[docs] class TailData: """Load and manage tail distribution datasets. This class provides functionality to load datasets either from the package's built-in data directory using a name, or from a custom path provided by the user. Parameters ---------- name : str, optional Name of a built-in dataset to load (without file extension). Must be provided if `path` is None. path : str, optional Path to a custom dataset file. If provided, this takes precedence over `name`. Must be provided if `name` is None. Attributes ---------- name : str or None Name of the dataset if a built-in dataset was loaded. path : str or None Path to the dataset file if a custom dataset was loaded. data : numpy.ndarray The loaded dataset as a numpy array. Examples -------- Load a built-in dataset: >>> data = TailData(name='CAIDA_KONECT') >>> print(len(data.data)) Load a custom dataset: >>> data = TailData(path='path/to/my/data.dat') >>> print(len(data.data)) """ def __init__(self, name=None, path=None): if name is None and path is None: raise ValueError("Either 'name' or 'path' must be provided") if name is not None and path is not None: logging.info("Both 'name' and 'path' provided; 'path' will take precedence") self.name = name self.path = path self.data = self.load_data()
[docs] def load_data(self): """Load data from either a built-in dataset or a custom file path. Returns ------- numpy.ndarray The loaded dataset as a numpy array. Raises ------ FileNotFoundError If the specified dataset file cannot be found. """ # Determine the file path based on whether name or path was provided if self.path is not None: # Use the provided custom path file_path = self.path logging.info(f"Using custom path: {file_path}") else: # Use the package data directory with the provided name data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") file_path = os.path.join(data_dir, f"{self.name}.dat") logging.info(f"Using package data path: {file_path}") # Check if the file exists if not os.path.exists(file_path): if self.path is not None: raise FileNotFoundError(f"Data file not found at path: {file_path}") else: raise FileNotFoundError( f"Data file '{self.name}.dat' not found in package data directory." ) # Load the data from the file using the provided method logging.info(f"Loading data from file: {file_path}") with open(file_path) as file: lines = file.readlines() # Determine the total number of data points N = sum(int(line.strip().split()[1]) for line in lines) ordered_data = np.zeros(N) current_index = 0 # Populate the ordered_data array for line in lines: degree, count = line.strip().split() ordered_data[current_index : current_index + int(count)] = float(degree) current_index += int(count) return ordered_data
[docs] def __repr__(self): """Return a string representation of the TailData object. Returns ------- str String representation including the data source and length. """ if self.path is not None: return f"TailData(path='{self.path}', data_length={len(self.data)})" else: return f"TailData(name='{self.name}', data_length={len(self.data)})"