_arffread.py 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885
  1. # Last Change: Mon Aug 20 08:00 PM 2007 J
  2. import re
  3. import datetime
  4. import numpy as np
  5. import csv
  6. import ctypes
  7. """A module to read arff files."""
  __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']

  # An ARFF file is basically two parts:
  #   - header
  #   - data
  #
  # A header has each of its components starting with @META, where META is one
  # of the keywords (attribute or relation, for now).

  # TODO:
  #   - both integer and reals are treated as numeric -> the integer info
  #     is lost!
  #   - Replace ValueError by ParseError or something

  # We can now handle the following:
  #   - numeric and nominal attributes
  #   - missing values for numeric attributes

  # Match a line whose first non-blank character is @
  r_meta = re.compile(r'^\s*@')
  # Match a comment (a line starting with %)
  r_comment = re.compile(r'^%')
  # Match an empty line (whitespace only, at least one character)
  r_empty = re.compile(r'^\s+$')
  # Match a header line, that is a line which starts with @ + a word
  # (leading whitespace is allowed)
  r_headerline = re.compile(r'^\s*@\S*')
  # Match the @data marker, case-insensitively; no leading whitespace allowed
  r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
  # Match @relation, case-insensitively; group 1 is the first
  # whitespace-free token that follows it
  r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
  # Match @attribute, case-insensitively; group 1 is everything after it
  r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
  # Match a nominal type definition; group 1 is the text between the braces
  r_nominal = re.compile(r'{(.+)}')
  # Match a date type definition; group 1 is the (optionally quoted) format
  r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
  # To get attribute names enclosed in '' (group 1: name, group 2: type text)
  r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
  # To get normal, unquoted attribute names (group 1: name, group 2: type text)
  r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
  38. # ------------------------
  39. # Module defined exception
  40. # ------------------------
  class ArffError(OSError):
      """Root of the exception hierarchy used when reading ARFF files.

      It derives from `OSError`, so handlers written for `OSError` also
      catch every ARFF-specific error.
      """
  class ParseArffError(ArffError):
      """Signal that the content of an ARFF file could not be parsed.

      Used for syntax-level problems such as a header that cannot be read
      or an attribute declaration whose type is not recognised.
      """
  55. # ----------
  56. # Attributes
  57. # ----------
  class Attribute:
      """Common base for the descriptions of ARFF attributes.

      Every attribute carries a ``name``, a ``range`` (``None`` here) and
      the NumPy ``dtype`` used to store its values (``np.object_`` here).
      Subclasses supply ``type_name`` and override the parsing hooks.
      """

      # Label of the attribute type; concrete subclasses set a string.
      type_name = None

      def __init__(self, name):
          self.name, self.range, self.dtype = name, None, np.object_

      @classmethod
      def parse_attribute(cls, name, attr_string):
          """
          Build an attribute from its declaration, if this class handles it.

          The base class recognises nothing and always returns None.
          """
          return None

      def parse_data(self, data_str):
          """
          Convert one raw field into a value of this type.

          The base class performs no conversion and always returns None.
          """
          return None

      def __str__(self):
          """Return the attribute rendered as ``<name>,<type_name>``."""
          return ','.join((self.name, self.type_name))
  class NominalAttribute(Attribute):
      """Attribute whose values come from a fixed set of strings."""

      type_name = 'nominal'

      def __init__(self, name, values):
          super().__init__(name)
          self.values = values
          self.range = values
          # Byte-string field wide enough for the longest allowed value.
          self.dtype = (np.bytes_, max(len(i) for i in values))

      @staticmethod
      def _get_nom_val(atrv):
          """Given a string containing a nominal type, returns a tuple of the
          possible values.

          A nominal type is defined as something framed between braces ({}).

          Parameters
          ----------
          atrv : str
             Nominal type definition

          Returns
          -------
          poss_vals : tuple
             possible values

          Raises
          ------
          ValueError
             If `atrv` does not start with a brace-framed definition.

          Examples
          --------
          >>> from scipy.io.arff._arffread import NominalAttribute
          >>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
          ('floup', 'bouga', 'fl', 'ratata')
          """
          m = r_nominal.match(atrv)
          if not m:
              raise ValueError("This does not look like a nominal string")
          attrs, _ = split_data_line(m.group(1))
          return tuple(attrs)

      @classmethod
      def parse_attribute(cls, name, attr_string):
          """
          Parse the attribute line if it knows how. Returns the parsed
          attribute, or None.

          For nominal attributes, the attribute string would be like '{<attr_1>,
           <attr2>, <attr_3>}'.
          """
          # startswith() also copes with an empty string, where indexing
          # the first character would raise IndexError.
          if not attr_string.startswith('{'):
              return None
          return cls(name, cls._get_nom_val(attr_string))

      def parse_data(self, data_str):
          """
          Parse a value of this type.

          The value is returned unchanged when it is one of the allowed
          values or the missing-value marker '?'; anything else raises
          ValueError.
          """
          if data_str in self.values or data_str == '?':
              return data_str
          raise ValueError(f"{str(data_str)} value not in {str(self.values)}")

      def __str__(self):
          """Return the attribute rendered as ``<name>,{<v1>,<v2>,...}``."""
          return self.name + ",{" + ",".join(self.values) + "}"
  class NumericAttribute(Attribute):
      """Attribute holding numbers, stored as ``np.float64``."""

      def __init__(self, name):
          super().__init__(name)
          self.type_name = 'numeric'
          self.dtype = np.float64

      @classmethod
      def parse_attribute(cls, name, attr_string):
          """
          Parse the attribute line if it knows how. Returns the parsed
          attribute, or None.

          A declaration is numeric when, once lower-cased and stripped, it
          begins with 'numeric', 'int' or 'real'.
          """
          normalized = attr_string.lower().strip()
          if normalized.startswith(('numeric', 'int', 'real')):
              return cls(name)
          return None

      def parse_data(self, data_str):
          """
          Parse a value of this type.

          Parameters
          ----------
          data_str : str
             string to convert

          Returns
          -------
          f : float
             the converted value; nan when `data_str` contains '?'

          Examples
          --------
          >>> from scipy.io.arff._arffread import NumericAttribute
          >>> atr = NumericAttribute('atr')
          >>> atr.parse_data('1')
          1.0
          >>> atr.parse_data('1\\n')
          1.0
          >>> atr.parse_data('?\\n')
          nan
          """
          # Any field containing the missing-value marker maps to NaN.
          return np.nan if '?' in data_str else float(data_str)

      def _basic_stats(self, data):
          """Return ``(min, max, mean, scaled std)`` of `data`.

          The standard deviation is multiplied by ``size / (size - 1)``.
          """
          # NOTE(review): min/max ignore NaN while mean/std do not, and a
          # single-element `data` divides by zero -- confirm this is intended.
          count = data.size
          nbfac = count * 1. / (count - 1)
          lowest, highest = np.nanmin(data), np.nanmax(data)
          return (lowest, highest, np.mean(data), np.std(data) * nbfac)
  class StringAttribute(Attribute):
      """Attribute declared with the ARFF type 'string'."""

      def __init__(self, name):
          super().__init__(name)
          self.type_name = 'string'

      @classmethod
      def parse_attribute(cls, name, attr_string):
          """
          Parse the attribute line if it knows how. Returns the parsed
          attribute, or None.

          A declaration is a string attribute when, once lower-cased and
          stripped, it begins with 'string'.
          """
          normalized = attr_string.lower().strip()
          return cls(name) if normalized.startswith('string') else None
  class DateAttribute(Attribute):
      """Attribute holding dates, stored as ``numpy.datetime64`` values."""

      def __init__(self, name, date_format, datetime_unit):
          super().__init__(name)
          self.date_format = date_format
          self.datetime_unit = datetime_unit
          self.type_name = 'date'
          self.range = date_format
          # A zero datetime64 scalar carrying the unit is kept as the dtype.
          self.dtype = np.datetime64(0, self.datetime_unit)

      @staticmethod
      def _get_date_format(atrv):
          """Translate the format of a date declaration to strptime syntax.

          Parameters
          ----------
          atrv : str
              Declaration of the form ``date <format>``; the format uses
              Java SimpleDateFormat tokens (yyyy, yy, MM, dd, HH, mm, ss).

          Returns
          -------
          pattern : str
              The format with the recognised tokens replaced by their C
              equivalents (%Y, %y, %m, %d, %H, %M, %S).
          datetime_unit : str
              NumPy datetime unit of the finest recognised token.

          Raises
          ------
          ValueError
              If `atrv` is not a date declaration with a format, if the
              format contains a time zone token (z or Z), or if it holds
              none of the recognised tokens.
          """
          m = r_date.match(atrv)
          if not m:
              raise ValueError("Invalid or no date format")

          pattern = m.group(1).strip()
          # convert time pattern from Java's SimpleDateFormat to C's format
          datetime_unit = None
          if "yyyy" in pattern:
              pattern = pattern.replace("yyyy", "%Y")
              datetime_unit = "Y"
          elif "yy" in pattern:
              # The membership test matters: a bare `elif "yy":` is always
              # true and would mark every format as containing a year.
              pattern = pattern.replace("yy", "%y")
              datetime_unit = "Y"
          # Ordered from coarsest to finest, so the unit that remains at the
          # end is the finest one present in the format.
          for java_token, c_token, unit in (("MM", "%m", "M"),
                                            ("dd", "%d", "D"),
                                            ("HH", "%H", "h"),
                                            ("mm", "%M", "m"),
                                            ("ss", "%S", "s")):
              if java_token in pattern:
                  pattern = pattern.replace(java_token, c_token)
                  datetime_unit = unit
          if "z" in pattern or "Z" in pattern:
              raise ValueError("Date type attributes with time zone not "
                               "supported, yet")

          if datetime_unit is None:
              raise ValueError("Invalid or unsupported date format")

          return pattern, datetime_unit

      @classmethod
      def parse_attribute(cls, name, attr_string):
          """
          Parse the attribute line if it knows how. Returns the parsed
          attribute, or None.

          For date attributes, the attribute string would be like
          'date <format>'.
          """
          # Strip first: the date regex is anchored at the start of the
          # string, so leading whitespace would make it fail.
          attr_string = attr_string.strip()
          if not attr_string.lower().startswith('date'):
              return None
          date_format, datetime_unit = cls._get_date_format(attr_string)
          return cls(name, date_format, datetime_unit)

      def parse_data(self, data_str):
          """
          Parse a value of this type.

          Surrounding whitespace and quotes are removed first. The
          missing-value marker '?' yields NaT; any other value is parsed
          with the attribute's format and cast to its datetime unit.
          """
          date_str = data_str.strip().strip("'").strip('"')
          if date_str == '?':
              return np.datetime64('NaT', self.datetime_unit)
          dt = datetime.datetime.strptime(date_str, self.date_format)
          return np.datetime64(dt).astype(
              f"datetime64[{self.datetime_unit}]")

      def __str__(self):
          """Return ``<name>,date,<format>``."""
          return super().__str__() + ',' + self.date_format
  class RelationalAttribute(Attribute):
      """Attribute whose value is itself a table of nested attributes."""

      def __init__(self, name):
          super().__init__(name)
          self.type_name = 'relational'
          self.dtype = np.object_
          # Nested attribute descriptions, in declaration order.
          self.attributes = []
          # CSV dialect reused across calls to parse_data once detected.
          self.dialect = None

      @classmethod
      def parse_attribute(cls, name, attr_string):
          """
          Parse the attribute line if it knows how. Returns the parsed
          attribute, or None.

          A declaration is relational when, once lower-cased and stripped,
          it begins with 'relational'.
          """
          normalized = attr_string.lower().strip()
          return cls(name) if normalized.startswith('relational') else None

      def parse_data(self, data_str):
          """
          Parse a value of this type.

          Escape sequences in `data_str` are decoded, the result is split
          on newlines, and each line is parsed with the nested attributes.
          Returns a structured array with one field per nested attribute.
          """
          unescaped = data_str.encode().decode("unicode-escape")

          records = []
          for raw in unescaped.split("\n"):
              fields, self.dialect = split_data_line(raw, self.dialect)
              records.append(tuple(
                  [nested.parse_data(fields[pos])
                   for pos, nested in enumerate(self.attributes)]))

          field_types = [(nested.name, nested.dtype)
                         for nested in self.attributes]
          return np.array(records, field_types)

      def __str__(self):
          """Return the base rendering followed by one line per nested
          attribute, each introduced by a newline and a tab."""
          header = super().__str__()
          nested_lines = '\n\t'.join(str(a) for a in self.attributes)
          return header + '\n\t' + nested_lines
  315. # -----------------
  316. # Various utilities
  317. # -----------------
  def to_attribute(name, attr_string):
      """Build the attribute object that matches a type declaration.

      The attribute classes are tried in a fixed order (nominal, numeric,
      date, string, relational) and the first one that recognises
      `attr_string` wins.

      Raises
      ------
      ParseArffError
          If no attribute class recognises `attr_string`.
      """
      candidates = (NominalAttribute, NumericAttribute, DateAttribute,
                    StringAttribute, RelationalAttribute)
      # The generator is lazy: classes after the first match are never asked.
      parsed = (kind.parse_attribute(name, attr_string) for kind in candidates)
      found = next((attr for attr in parsed if attr is not None), None)
      if found is None:
          raise ParseArffError(f"unknown attribute {attr_string}")
      return found
  def csv_sniffer_has_bug_last_field():
      """
      Checks if the bug https://bugs.python.org/issue30157 is unpatched.

      The answer is computed on the first call and stored on the function
      object as ``has_bug``; later calls return the stored value.
      """
      try:
          cached = csv_sniffer_has_bug_last_field.has_bug
      except AttributeError:
          cached = None
      if cached is not None:
          return cached

      # The sniffer is affected if it misses the quote of the last field.
      sniffed = csv.Sniffer().sniff("3, 'a'")
      verdict = sniffed.quotechar != "'"
      csv_sniffer_has_bug_last_field.has_bug = verdict
      return verdict
  def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
      """
      Workaround for the bug https://bugs.python.org/issue30157 if it is
      unpatched.

      When the running csv module has the bug and `sniff_line` ends with a
      quoted last field, the quote character, delimiter, doublequote and
      skipinitialspace settings are recomputed and written onto `dialect`.

      Parameters
      ----------
      sniff_line : str
          The line that was given to the csv sniffer.
      dialect : csv.Dialect
          Dialect returned by the sniffer; modified in place.
      delimiters : str
          Accepted delimiter characters; the dialect's delimiter is only
          replaced by one of these.

      Returns
      -------
      None
      """
      if csv_sniffer_has_bug_last_field():
          # Reuses code from the csv module
          right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'  # noqa: E501

          # Try the patterns in order and keep the first one that matches.
          for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",  # noqa: E501
                        r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',  # .*?",  # noqa: E501
                        right_regex,  # ,".*?"
                        r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space)  # noqa: E501
              regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
              matches = regexp.findall(sniff_line)
              if matches:
                  break

          # If it does not match the expression that was bugged,
          # then this bug does not apply
          if restr != right_regex:
              return

          groupindex = regexp.groupindex

          # There is only one end of the string
          assert len(matches) == 1
          m = matches[0]

          # findall() yields tuples indexed from 0, group numbers start at 1.
          n = groupindex['quote'] - 1
          quote = m[n]

          n = groupindex['delim'] - 1
          delim = m[n]

          n = groupindex['space'] - 1
          space = bool(m[n])

          # Look for a field with a third quote between the outer quotes,
          # which indicates that quotes are escaped by doubling them.
          dq_regexp = re.compile(
              rf"(({re.escape(delim)})|^)\W*{quote}[^{re.escape(delim)}\n]*{quote}[^{re.escape(delim)}\n]*{quote}\W*(({re.escape(delim)})|$)", re.MULTILINE  # noqa: E501
          )

          doublequote = bool(dq_regexp.search(sniff_line))

          dialect.quotechar = quote
          # Only override the delimiter with one the caller accepts.
          if delim in delimiters:
              dialect.delimiter = delim
          dialect.doublequote = doublequote
          dialect.skipinitialspace = space
  def split_data_line(line, dialect=None):
      """Split one data line into its fields.

      Parameters
      ----------
      line : str
          Raw line; a trailing newline and surrounding whitespace are
          removed before splitting. May be empty.
      dialect : csv.Dialect, optional
          Dialect to use. When None, it is sniffed from `line`, with comma
          and tab as the candidate delimiters.

      Returns
      -------
      row : list of str
          The fields of the line (an empty list for an empty line).
      dialect : csv.Dialect
          The dialect that was used, so callers can pass it back in for
          the following lines.
      """
      delimiters = ",\t"

      # This can not be done in a per reader basis, and relational fields
      # can be HUGE
      csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

      # Remove the line end if any. endswith() is used rather than
      # indexing so that an empty line does not raise IndexError.
      if line.endswith('\n'):
          line = line[:-1]

      # Remove potential trailing whitespace
      line = line.strip()

      sniff_line = line

      # Add a delimiter if none is present, so that the csv.Sniffer
      # does not complain for a single-field CSV.
      if not any(d in line for d in delimiters):
          sniff_line += ","

      if dialect is None:
          dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
          workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
                                                dialect=dialect,
                                                delimiters=delimiters)

      row = next(csv.reader([line], dialect))

      return row, dialect
  397. # --------------
  398. # Parsing header
  399. # --------------
  def tokenize_attribute(iterable, attribute):
      """Parse a raw string in header (e.g., starts by @attribute).

      Given a raw string attribute, build the attribute object it declares.

      Constraints:

      * The line must start with @attribute (case insensitive, and
        space like characters before @attribute are allowed)
      * The declaration must fit on that single line.

      Parameters
      ----------
      iterable : iterator of str
          Source of the following lines; one item is consumed after the
          declaration, more if the attribute is relational.
      attribute : str
         the attribute string.

      Returns
      -------
      attribute : Attribute
         the parsed attribute
      next_item : str
         next line to be parsed

      Raises
      ------
      ValueError
          If the line is not an @attribute line, or holds no name followed
          by a type.

      Examples
      --------
      If attribute is a string defined in python as r"floupi real", the
      parsed attribute is named floupi and is numeric.

      >>> from scipy.io.arff._arffread import tokenize_attribute
      >>> iterable = iter([0] * 10) # dummy iterator
      >>> attr, nxt = tokenize_attribute(iterable, r"@attribute floupi real")
      >>> attr.name, attr.type_name, nxt
      ('floupi', 'numeric', 0)

      If attribute is r"'floupi 2' real", the name is 'floupi 2'.

      >>> attr, nxt = tokenize_attribute(iterable,
      ...                                r"  @attribute 'floupi 2' real   ")
      >>> attr.name, attr.type_name, nxt
      ('floupi 2', 'numeric', 0)

      """
      stripped = attribute.strip()
      declaration = r_attribute.match(stripped)
      if not declaration:
          raise ValueError(f"First line unparsable: {stripped}")

      # Everything after @attribute
      remainder = declaration.group(1)
      if r_comattrval.match(remainder):
          name, type_text = tokenize_single_comma(remainder)
      elif r_wcomattrval.match(remainder):
          name, type_text = tokenize_single_wcomma(remainder)
      else:
          # Not sure we should support this, as it does not seem supported by
          # weka.
          raise ValueError("multi line not supported yet")
      next_item = next(iterable)

      parsed = to_attribute(name, type_text)

      # A relational attribute is followed by its nested declarations.
      if type_text.lower() == 'relational':
          next_item = read_relational_attribute(iterable, parsed, next_item)

      return parsed, next_item
  def tokenize_single_comma(val):
      """Split ``'<name>' <type>`` (quoted name) into name and type.

      Parameters
      ----------
      val : str
          Text that follows @attribute.

      Returns
      -------
      name : str
          The name found between the single quotes, stripped.
      attr_type : str
          The remaining type text, stripped.

      Raises
      ------
      ValueError
          If `val` does not have the expected form.
      """
      # XXX we match twice the same string (here and at the caller level). It is
      # stupid, but it is easier for now...
      m = r_comattrval.match(val)
      if m is None:
          raise ValueError(f"Error while tokenizing single {val}")
      # Both groups always exist on a match, so no IndexError can occur.
      return m.group(1).strip(), m.group(2).strip()
  def tokenize_single_wcomma(val):
      """Split ``<name> <type>`` (unquoted name) into name and type.

      Parameters
      ----------
      val : str
          Text that follows @attribute.

      Returns
      -------
      name : str
          The first whitespace-free token, stripped.
      attr_type : str
          The remaining type text, stripped.

      Raises
      ------
      ValueError
          If `val` does not have the expected form.
      """
      # XXX we match twice the same string (here and at the caller level). It is
      # stupid, but it is easier for now...
      m = r_wcomattrval.match(val)
      if m is None:
          raise ValueError(f"Error while tokenizing single {val}")
      # Both groups always exist on a match, so no IndexError can occur.
      return m.group(1).strip(), m.group(2).strip()
  def read_relational_attribute(ofile, relational_attribute, i):
      """Read the nested attributes of a relational attribute

      Parameters
      ----------
      ofile : iterator of str
          Source of the header lines.
      relational_attribute : RelationalAttribute
          Attribute being filled; nested attributes are appended to its
          ``attributes`` list.
      i : str
          First line following the relational declaration.

      Returns
      -------
      i : str
          The line that follows the closing ``@end <name>`` line.

      Raises
      ------
      ValueError
          If a header line inside the block is not an @attribute line.
      """
      # The name is escaped so that regex metacharacters in it are matched
      # literally.
      r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                    re.escape(relational_attribute.name) +
                                    r'\s*$')

      while not r_end_relational.match(i):
          if not r_headerline.match(i):
              # Not a header line: skip it.
              i = next(ofile)
              continue
          if not r_attribute.match(i):
              raise ValueError(f"Error parsing line {i}")
          attr, i = tokenize_attribute(ofile, i)
          relational_attribute.attributes.append(attr)

      # Step past the @end line.
      i = next(ofile)
      return i
  def read_header(ofile):
      """Read the header of the iterable ofile.

      Lines are consumed up to the @data line. Returns the relation name
      (None if no @relation line was seen) and the list of parsed
      attributes. Raises ValueError for a header line that is neither an
      @attribute nor a @relation line.
      """
      line = next(ofile)

      # Pass first comments
      while r_comment.match(line):
          line = next(ofile)

      # Header is everything up to DATA attribute ?
      relation = None
      attributes = []
      while not r_datameta.match(line):
          if not r_headerline.match(line):
              # Not a header line: skip it.
              line = next(ofile)
              continue

          if r_attribute.match(line):
              # tokenize_attribute already hands back the following line.
              parsed, line = tokenize_attribute(ofile, line)
              attributes.append(parsed)
              continue

          relation_match = r_relation.match(line)
          if not relation_match:
              raise ValueError(f"Error parsing line {line}")
          relation = relation_match.group(1)
          line = next(ofile)

      return relation, attributes
  class MetaData:
      """Small container to keep useful information on a ARFF dataset.

      Holds the relation name and the attributes, keyed by attribute name.

      Examples
      --------
      ::

          data, meta = loadarff('iris.arff')
          # Iterating yields the attribute names of the iris.arff dataset
          for i in meta:
              print(i)
          # The same names, as a list
          meta.names()
          # The attribute types
          types = meta.types()

      Methods
      -------
      names
      types

      Notes
      -----
      The attributes are kept in the order they were given, so iterating
      over an instance returns the attribute names in definition order.

      """
      def __init__(self, rel, attr):
          self.name = rel
          self._attributes = {}
          for a in attr:
              self._attributes[a.name] = a

      def __repr__(self):
          pieces = [f"Dataset: {self.name}\n"]
          for attr_name, attribute in self._attributes.items():
              entry = f"\t{attr_name}'s type is {attribute.type_name}"
              # The range is only shown when there is one.
              if attribute.range:
                  entry += f", range is {str(attribute.range)}"
              pieces.append(entry + '\n')
          return "".join(pieces)

      def __iter__(self):
          return iter(self._attributes)

      def __getitem__(self, key):
          found = self._attributes[key]
          return (found.type_name, found.range)

      def names(self):
          """Return the list of attribute names.

          Returns
          -------
          attrnames : list of str
              The attribute names.
          """
          return [attr_name for attr_name in self._attributes]

      def types(self):
          """Return the list of attribute types.

          Returns
          -------
          attr_types : list of str
              The attribute types.
          """
          return [attribute.type_name
                  for attribute in self._attributes.values()]
  def loadarff(f):
      """
      Read an arff file.

      The data is returned as a record array, which can be accessed much like
      a dictionary of NumPy arrays. For example, if one of the attributes is
      called 'pressure', then its first 10 data points can be accessed from the
      ``data`` record array like so: ``data['pressure'][0:10]``


      Parameters
      ----------
      f : file-like or str
         File-like object to read from, or filename to open.

      Returns
      -------
      data : record array
         The data of the arff file, accessible by attribute names.
      meta : `MetaData`
         Contains information about the arff file such as name and
         type of attributes, the relation (name of the dataset), etc.

      Raises
      ------
      ParseArffError
          This is raised if the given file is not ARFF-formatted.
      NotImplementedError
          The ARFF file has an attribute which is not supported yet.

      Notes
      -----

      This function should be able to read most arff files. Not
      implemented functionality include:

      * string type attributes

      It can read files with numeric, nominal, date and relational
      attributes. It cannot read files with sparse data ({} in the file).
      However, this function can read files with missing data (? in the
      file); missing numeric values are represented as NaNs.

      A file opened here from a filename is closed before returning; a
      file-like object passed in by the caller is left open.

      Examples
      --------
      >>> from scipy.io import arff
      >>> from io import StringIO
      >>> content = \"\"\"
      ... @relation foo
      ... @attribute width  numeric
      ... @attribute height numeric
      ... @attribute color  {red,green,blue,yellow,black}
      ... @data
      ... 5.0,3.25,blue
      ... 4.5,3.75,green
      ... 3.0,4.00,red
      ... \"\"\"
      >>> f = StringIO(content)
      >>> data, meta = arff.loadarff(f)
      >>> data
      array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
            dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
      >>> meta
      Dataset: foo
      \twidth's type is numeric
      \theight's type is numeric
      \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')

      """
      # Anything without a read() method is treated as a filename.
      opened_here = not hasattr(f, 'read')
      ofile = open(f) if opened_here else f
      try:
          return _loadarff(ofile)
      finally:
          if opened_here:  # only close what we opened
              ofile.close()
  def _loadarff(ofile):
      """Parse an open ARFF stream into ``(data, meta)``.

      The header is read first; a ValueError raised while doing so is
      re-raised as ParseArffError. Files declaring a string attribute are
      rejected with NotImplementedError. Every remaining line that is
      neither a comment nor empty is converted to one record.
      """
      # Parse the header file
      try:
          rel, attr = read_header(ofile)
      except ValueError as e:
          raise ParseArffError(
              "Error while parsing header, error was: " + str(e)) from e

      meta = MetaData(rel, attr)

      # String attributes are not supported yet.
      # How to support string efficiently ? Ideally, we should know the max
      # size of the string before allocating the numpy array.
      if any(isinstance(a, StringAttribute) for a in attr):
          raise NotImplementedError("String attributes not supported yet, sorry")

      # Resolved once, since neither changes from one row to the next.
      converters = [a.parse_data for a in attr]
      positions = range(len(attr))

      def records(lines):
          # TODO: this is where we are spending time (~80%).
          dialect = None
          for raw in lines:
              # We do not abstract skipping comments and empty lines for
              # performance reasons.
              if r_comment.match(raw) or r_empty.match(raw):
                  continue

              fields, dialect = split_data_line(raw, dialect)
              yield tuple([converters[k](fields[k]) for k in positions])

      rows = list(records(ofile))
      # No error should happen here: it is a bug otherwise
      data = np.array(rows, [(a.name, a.dtype) for a in attr])
      return data, meta