Source code for datumaro.cli.commands.filter
# Copyright (C) 2020-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
import argparse
import logging as log
import os
import os.path as osp
from datumaro.components.filter import DatasetItemEncoder
from datumaro.util.scope import scoped
from ..util import MultilineFormatter
from ..util.dataset_utils import FilterModes, parse_dataset_pathspec
from ..util.errors import CliException
[docs]
def build_parser(parser_ctor=argparse.ArgumentParser):
    """Create and configure the argument parser of the dataset filter command.

    Args:
        parser_ctor: callable used to construct the parser. It is invoked
            with ``help``, ``description`` and ``formatter_class`` keyword
            arguments.
            NOTE(review): ``help`` is an ``add_parser``-style keyword, so
            this is presumably meant to receive a subparsers' ``add_parser``
            - confirm against the caller.

    Returns:
        The configured parser, whose ``command`` default is ``filter_command``.
    """
    # The help text is kept flush-left so that the literal stays exactly as it
    # is in the file. '|n' and '|s' are presumably line-break / space markers
    # interpreted by MultilineFormatter - confirm in ..util.
    description_text = """
Extracts a subdataset that contains only items matching filter.|n
|n
By default, datasets are updated in-place. The '-o/--output-dir'
option can be used to specify another output directory. When
updating in-place, use the '--overwrite' parameter.|n
|n
A filter is an XPath expression, which is applied to XML
representation of a dataset item. Check '--dry-run' parameter
to see XML representations of the dataset items.|n
|n
To filter annotations use the mode ('-m') parameter.|n
Supported modes:|n
- 'i', 'items'|n
- 'a', 'annotations'|n
- 'i+a', 'a+i', 'items+annotations', 'annotations+items'|n
When filtering annotations, use the 'items+annotations'
mode to point that annotation-less dataset items should be
removed. To select an annotation, write an XPath that
returns 'annotation' elements (see examples).|n
|n
Usage: %(prog)s <dataset_path>|n
|n
<dataset_path> - dataset path with optional format:|n
|s|s- <dataset path>[ :<format> ]|n
|n
Examples:|n
- Filter images with width < height:|n
|s|s%(prog)s -e '/item[image/width < image/height]' dataset/|n
|n
- Filter images with large-area bboxes:|n
|s|s%(prog)s -e '/item[annotation/type="bbox" and
annotation/area>2000]' dataset/|n
|n
- Filter out all irrelevant annotations from items:|n
|s|s%(prog)s -m a -e '/item/annotation[label = "person"]' dataset/|n
|n
- Filter out all irrelevant annotations from items:|n
|s|s%(prog)s -m a -e '/item/annotation[label="cat" and
area > 99.5]' dataset/|n
|n
- Filter occluded annotations and items, if no annotations left:|n
|s|s%(prog)s -m i+a -e '/item/annotation[occluded="True"]' dataset/|n
|n
- Filter a VOC-like dataset inplace:|n
|s|s%(prog)s -e '/item/annotation[label = "bus"]' --overwrite dataset/:voc
"""

    filter_parser = parser_ctor(
        help="Extract subdataset",
        description=description_text,
        formatter_class=MultilineFormatter,
    )

    # Positional: which dataset to filter.
    filter_parser.add_argument(
        "target",
        help="Target dataset path with optional format (e.g., 'dataset/' or 'dataset/:voc')",
    )

    # What to keep and how to apply the expression.
    filter_parser.add_argument(
        "-e", "--filter", help="XML XPath filter expression for dataset items"
    )
    # '%(default)s' is left as a placeholder in the help string so that
    # argparse substitutes the default value when rendering help.
    mode_options = ", ".join(FilterModes.list_options())
    mode_help = "Filter mode (options: " + mode_options + "; default: %(default)s)"
    filter_parser.add_argument(
        "-m",
        "--mode",
        default=FilterModes.i.name,
        type=FilterModes.parse,
        help=mode_help,
    )
    filter_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print XML representations to be filtered and exit",
    )

    # Where and how to save the result.
    filter_parser.add_argument(
        "-o",
        "--output-dir",
        dest="dst_dir",
        help="Output directory. If not specified, the results will be saved inplace.",
    )
    filter_parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing files in the save directory",
    )

    filter_parser.set_defaults(command=filter_command)
    return filter_parser
[docs]
@scoped
def filter_command(args):
    """Filter a dataset with an XPath expression and save or preview the result.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``target``, ``filter``, ``mode``, ``dry_run``, ``dst_dir`` and
            ``overwrite``.

    Returns:
        0 on success (both for a dry run and for a saved result).

    Raises:
        CliException: if the target dataset cannot be parsed/loaded, if no
            filter expression is given outside of a dry run, or if the
            output directory exists, is not empty and ``--overwrite`` was
            not passed.
    """
    filter_args = FilterModes.make_filter_args(args.mode)
    filter_expr = args.filter

    try:
        dataset = parse_dataset_pathspec(args.target)
    except Exception as e:
        # Translate any loading failure into a CLI-level error, chaining
        # explicitly so the original error is reported as the direct cause.
        raise CliException(str(e)) from e

    if args.dry_run:
        # Dry run: only print the XML representation of the filtered items.
        # This branch runs before the "filter is required" check below, so it
        # is reachable without an '-e' expression.
        dataset = dataset.filter(filter_expr, **filter_args)
        for item in dataset:
            encoded_item = DatasetItemEncoder.encode(item, dataset.categories())
            xml_item = DatasetItemEncoder.to_string(encoded_item)
            print(xml_item)
        return 0

    if not args.filter:
        raise CliException("Expected a filter expression ('-e' argument)")

    log.info("Filtering...")

    # Without an explicit output directory the dataset is updated in place.
    dst_dir = args.dst_dir or dataset.data_path
    if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
        raise CliException(
            "Directory '%s' already exists " "(pass --overwrite to overwrite)" % dst_dir
        )
    dst_dir = osp.abspath(dst_dir)

    dataset = dataset.filter(filter_expr, **filter_args)
    dataset.save(dst_dir, save_media=True)

    # Pass the path as a logging argument so formatting is deferred until the
    # record is actually emitted.
    log.info("Results have been saved to '%s'", dst_dir)

    return 0