Skip to content

Commit 409dbe7

Browse files
committed
feat: allow disabling network and/or storage metric collection only
Since not all metrics are likely to be of the same relevance, it would be beneficial from a cost perspective to allow disabling some of them, namely Network IO and Block IO. This change introduces the ability to disable network and storage stats collection via ECS_DISABLE_NETWORK_METRICS and ECS_DISABLE_STORAGE_METRICS respectively.
1 parent 12cb750 commit 409dbe7

File tree

10 files changed

+111
-66
lines changed

10 files changed

+111
-66
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ additional details on each available environment variable.
186186
| `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data`
187187
| `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false |
188188
| `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | false |
189+
| `ECS_DISABLE_STORAGE_METRICS` | <true | false> | Whether to disable gathering of storage (block IO) metrics for tasks. | false | false |
190+
| `ECS_DISABLE_NETWORK_METRICS` | <true | false> | Whether to disable gathering of network (network IO) metrics for tasks. | false | false |
189191
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. Setting this value to `true` can help reduce the CPU usage of dockerd and containerd on the ECS container instance. See also `ECS_POLLING_METRICS_WAIT_DURATION` for setting the poll interval. | `false` | `false` |
190192
| `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when `ECS_POLL_METRICS` is false. Maximum value is 20s and minimum value is 5s. If the value is set above the maximum, it is set to the maximum; if it is set below the minimum, it is set to the minimum. As the number of tasks/containers increases, a higher `ECS_POLLING_METRICS_WAIT_DURATION` value can potentially cause a problem where the memory reservation value of the ECS cluster reported in metrics becomes unstable due to a missing metrics sample at metric collection time. It is recommended to keep this value smaller than 18s. This behavior is only observed on certain operating systems and platforms. | 10s | 10s |
191193
| `ECS_PULL_DEPENDENT_CONTAINERS_UPFRONT` | <true | false> | Whether to pull images for containers with dependencies before the dependsOn condition has been satisfied. | false | false |

agent/config/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,8 @@ func environmentConfig() (Config, error) {
541541
UpdatesEnabled: parseBooleanDefaultFalseConfig("ECS_UPDATES_ENABLED"),
542542
UpdateDownloadDir: os.Getenv("ECS_UPDATE_DOWNLOAD_DIR"),
543543
DisableMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_METRICS"),
544+
DisableNetworkMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_NETWORK_METRICS"),
545+
DisableStorageMetrics: parseBooleanDefaultFalseConfig("ECS_DISABLE_STORAGE_METRICS"),
544546
ReservedMemory: parseEnvVariableUint16("ECS_RESERVED_MEMORY"),
545547
AvailableLoggingDrivers: parseAvailableLoggingDrivers(),
546548
PrivilegedDisabled: parseBooleanDefaultFalseConfig("ECS_DISABLE_PRIVILEGED"),
@@ -626,6 +628,8 @@ func (cfg *Config) String() string {
626628
"AuthType: %v, "+
627629
"UpdatesEnabled: %v, "+
628630
"DisableMetrics: %v, "+
631+
"DisableNetworkMetrics: %v, "+
632+
"DisableStorageMetrics: %v, "+
629633
"PollMetrics: %v, "+
630634
"PollingMetricsWaitDuration: %v, "+
631635
"ReservedMem: %v, "+
@@ -646,6 +650,8 @@ func (cfg *Config) String() string {
646650
cfg.EngineAuthType,
647651
cfg.UpdatesEnabled,
648652
cfg.DisableMetrics,
653+
cfg.DisableNetworkMetrics,
654+
cfg.DisableStorageMetrics,
649655
cfg.PollMetrics,
650656
cfg.PollingMetricsWaitDuration,
651657
cfg.ReservedMemory,

agent/config/config_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,10 +252,14 @@ func TestConfigBoolean(t *testing.T) {
252252
defer setTestRegion()()
253253
defer setTestEnv("ECS_DISABLE_DOCKER_HEALTH_CHECK", "true")()
254254
defer setTestEnv("ECS_DISABLE_METRICS", "true")()
255+
defer setTestEnv("ECS_DISABLE_NETWORK_METRICS", "true")()
256+
defer setTestEnv("ECS_DISABLE_STORAGE_METRICS", "true")()
255257
defer setTestEnv("ECS_ENABLE_SPOT_INSTANCE_DRAINING", "true")()
256258
cfg, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
257259
assert.NoError(t, err)
258260
assert.True(t, cfg.DisableMetrics.Enabled())
261+
assert.True(t, cfg.DisableNetworkMetrics.Enabled())
262+
assert.True(t, cfg.DisableStorageMetrics.Enabled())
259263
assert.True(t, cfg.DisableDockerHealthCheck.Enabled())
260264
assert.True(t, cfg.SpotInstanceDrainingEnabled.Enabled())
261265
}

agent/config/config_unix.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ func DefaultConfig() Config {
7777
DataDir: "/data/",
7878
DataDirOnHost: "/var/lib/ecs",
7979
DisableMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
80+
DisableNetworkMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
81+
DisableStorageMetrics: BooleanDefaultFalse{Value: ExplicitlyDisabled},
8082
ReservedMemory: 0,
8183
AvailableLoggingDrivers: []dockerclient.LoggingDriver{dockerclient.JSONFileDriver, dockerclient.NoneDriver},
8284
TaskCleanupWaitDuration: DefaultTaskCleanupWaitDuration,

agent/config/config_unix_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ func TestConfigDefault(t *testing.T) {
4141
assert.Equal(t, "unix:///var/run/docker.sock", cfg.DockerEndpoint, "Default docker endpoint set incorrectly")
4242
assert.Equal(t, "/data/", cfg.DataDir, "Default datadir set incorrectly")
4343
assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly")
44+
assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly")
45+
assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly")
4446
assert.Equal(t, 5, len(cfg.ReservedPorts), "Default reserved ports set incorrectly")
4547
assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly")
4648
assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly")

agent/config/config_windows_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ func TestConfigDefault(t *testing.T) {
4343
assert.Equal(t, "npipe:////./pipe/docker_engine", cfg.DockerEndpoint, "Default docker endpoint set incorrectly")
4444
assert.Equal(t, `C:\ProgramData\Amazon\ECS\data`, cfg.DataDir, "Default datadir set incorrectly")
4545
assert.False(t, cfg.DisableMetrics.Enabled(), "Default disablemetrics set incorrectly")
46+
assert.False(t, cfg.DisableStorageMetrics.Enabled(), "Default disablestoragemetrics set incorrectly")
47+
assert.False(t, cfg.DisableNetworkMetrics.Enabled(), "Default disablenetworkmetrics set incorrectly")
4648
assert.Equal(t, 11, len(cfg.ReservedPorts), "Default reserved ports set incorrectly")
4749
assert.Equal(t, uint16(0), cfg.ReservedMemory, "Default reserved memory set incorrectly")
4850
assert.Equal(t, 30*time.Second, cfg.DockerStopTimeout, "Default docker stop container timeout set incorrectly")

agent/config/types.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,14 @@ type Config struct {
9191
// sent to the ECS telemetry endpoint
9292
DisableMetrics BooleanDefaultFalse
9393

94+
// DisableNetworkMetrics configures whether task network IO utilization metrics should be
95+
// sent to the ECS telemetry endpoint
96+
DisableNetworkMetrics BooleanDefaultFalse
97+
98+
// DisableStorageMetrics configures whether task block IO utilization metrics should be
99+
// sent to the ECS telemetry endpoint
100+
DisableStorageMetrics BooleanDefaultFalse
101+
94102
// PollMetrics configures whether metrics are constantly streamed for each container or
95103
// polled on interval instead.
96104
PollMetrics BooleanDefaultFalse

agent/stats/engine.go

Lines changed: 53 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -864,14 +864,16 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
864864
MemoryStatsSet: memoryStatsSet,
865865
}
866866

867-
storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
868-
if err != nil && age > gracePeriod {
869-
logger.Warn("Error getting storage stats for container", logger.Fields{
870-
field.Container: dockerID,
871-
field.Error: err,
872-
})
873-
} else {
874-
containerMetric.StorageStatsSet = storageStatsSet
867+
if !engine.config.DisableStorageMetrics.Enabled() {
868+
storageStatsSet, err := container.statsQueue.GetStorageStatsSet()
869+
if err != nil && age > gracePeriod {
870+
logger.Warn("Error getting storage stats for container", logger.Fields{
871+
field.Container: dockerID,
872+
field.Error: err,
873+
})
874+
} else {
875+
containerMetric.StorageStatsSet = storageStatsSet
876+
}
875877
}
876878

877879
restartStatsSet, err := container.statsQueue.GetRestartStatsSet()
@@ -887,51 +889,53 @@ func (engine *DockerStatsEngine) taskContainerMetricsUnsafe(taskArn string) ([]*
887889
containerMetric.RestartStatsSet = restartStatsSet
888890
}
889891

890-
task, err := engine.resolver.ResolveTask(dockerID)
891-
if err != nil {
892-
logger.Warn("Task not found for container", logger.Fields{
893-
field.Container: dockerID,
894-
field.Error: err,
895-
})
896-
} else {
897-
if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil {
898-
logger.Warn("Could not map container ID to container, container", logger.Fields{
899-
field.DockerId: dockerID,
900-
field.Error: err,
892+
if !engine.config.DisableNetworkMetrics.Enabled() {
893+
task, err := engine.resolver.ResolveTask(dockerID)
894+
if err != nil {
895+
logger.Warn("Task not found for container", logger.Fields{
896+
field.Container: dockerID,
897+
field.Error: err,
901898
})
902899
} else {
903-
// send network stats for default/bridge/nat/awsvpc network modes
904-
if task.IsNetworkModeBridge() {
905-
if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause {
906-
seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task")
907-
} else {
908-
networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
909-
if err != nil && age > gracePeriod {
910-
// we log the error and still continue to publish cpu, memory stats
911-
logger.Warn("Error getting network stats for container", logger.Fields{
912-
field.Container: dockerID,
913-
field.Error: err,
914-
})
900+
if dockerContainer, err := engine.resolver.ResolveContainer(dockerID); err != nil {
901+
logger.Warn("Could not map container ID to container, container", logger.Fields{
902+
field.DockerId: dockerID,
903+
field.Error: err,
904+
})
905+
} else {
906+
// send network stats for default/bridge/nat/awsvpc network modes
907+
if task.IsNetworkModeBridge() {
908+
if task.IsServiceConnectEnabled() && dockerContainer.Container.Type == apicontainer.ContainerCNIPause {
909+
seelog.Debug("Skip adding network stats for pause container in Service Connect enabled task")
915910
} else {
916-
containerMetric.NetworkStatsSet = networkStatsSet
911+
networkStatsSet, err := container.statsQueue.GetNetworkStatsSet()
912+
if err != nil && age > gracePeriod {
913+
// we log the error and still continue to publish cpu, memory stats
914+
logger.Warn("Error getting network stats for container", logger.Fields{
915+
field.Container: dockerID,
916+
field.Error: err,
917+
})
918+
} else {
919+
containerMetric.NetworkStatsSet = networkStatsSet
920+
}
917921
}
918-
}
919-
} else if task.IsNetworkModeAWSVPC() {
920-
taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn]
921-
if !taskExistsInTaskStats {
922-
return nil, fmt.Errorf("task not found")
923-
}
924-
// do not add network stats for pause container
925-
if dockerContainer.Container.Type != apicontainer.ContainerCNIPause {
926-
networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet()
927-
if err != nil && age > gracePeriod {
928-
logger.Warn("Error getting network stats for container", logger.Fields{
929-
field.TaskARN: taskArn,
930-
field.Container: dockerContainer.DockerID,
931-
field.Error: err,
932-
})
933-
} else {
934-
containerMetric.NetworkStatsSet = networkStats
922+
} else if task.IsNetworkModeAWSVPC() {
923+
taskStatsMap, taskExistsInTaskStats := engine.taskToTaskStats[taskArn]
924+
if !taskExistsInTaskStats {
925+
return nil, fmt.Errorf("task not found")
926+
}
927+
// do not add network stats for pause container
928+
if dockerContainer.Container.Type != apicontainer.ContainerCNIPause {
929+
networkStats, err := taskStatsMap.StatsQueue.GetNetworkStatsSet()
930+
if err != nil && age > gracePeriod {
931+
logger.Warn("Error getting network stats for container", logger.Fields{
932+
field.TaskARN: taskArn,
933+
field.Container: dockerContainer.DockerID,
934+
field.Error: err,
935+
})
936+
} else {
937+
containerMetric.NetworkStatsSet = networkStats
938+
}
935939
}
936940
}
937941
}

agent/stats/engine_test.go

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -724,20 +724,22 @@ func TestSynchronizeOnRestart(t *testing.T) {
724724

725725
func TestTaskNetworkStatsSet(t *testing.T) {
726726
var networkModes = []struct {
727-
ENIs []*ni.NetworkInterface
728-
NetworkMode string
729-
ServiceConnectEnabled bool
730-
StatsEmpty bool
727+
ENIs []*ni.NetworkInterface
728+
NetworkMode string
729+
ServiceConnectEnabled bool
730+
NetworkMetricsDisabled bool
731+
StatsEmpty bool
731732
}{
732-
{nil, DefaultNetworkMode, false, false},
733-
{nil, DefaultNetworkMode, true, true},
733+
{nil, DefaultNetworkMode, false, false, false},
734+
{nil, DefaultNetworkMode, true, false, true},
735+
{nil, DefaultNetworkMode, false, true, true},
734736
}
735737
for _, tc := range networkModes {
736-
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.StatsEmpty)
738+
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, tc.ServiceConnectEnabled, tc.NetworkMetricsDisabled, tc.StatsEmpty)
737739
}
738740
}
739741

740-
func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, emptyStats bool) {
742+
func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterface, serviceConnectEnabled, networkMetricsDisabled, emptyStats bool) {
741743
mockCtrl := gomock.NewController(t)
742744
defer mockCtrl.Finish()
743745
resolver := mock_resolver.NewMockContainerMetadataResolver(mockCtrl)
@@ -782,7 +784,18 @@ func testNetworkModeStats(t *testing.T, netMode string, enis []*ni.NetworkInterf
782784
State: &types.ContainerState{Pid: 23},
783785
},
784786
}, nil).AnyTimes()
785-
engine := NewDockerStatsEngine(&cfg, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil)
787+
788+
var c *config.Config
789+
if networkMetricsDisabled {
790+
c = &config.Config{
791+
DisableNetworkMetrics: config.BooleanDefaultFalse{Value: config.ExplicitlyEnabled},
792+
}
793+
c = c.Merge(cfg)
794+
} else {
795+
c = &cfg
796+
}
797+
798+
engine := NewDockerStatsEngine(c, nil, eventStream("TestTaskNetworkStatsSet"), nil, nil, nil)
786799
ctx, cancel := context.WithCancel(context.TODO())
787800
defer cancel()
788801
engine.ctx = ctx

agent/stats/engine_unix_test.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,19 @@ const (
4343

4444
func TestLinuxTaskNetworkStatsSet(t *testing.T) {
4545
var networkModes = []struct {
46-
ENIs []*ni.NetworkInterface
47-
NetworkMode string
48-
StatsEmpty bool
46+
ENIs []*ni.NetworkInterface
47+
NetworkMode string
48+
NetworkMetricsDisabled bool
49+
StatsEmpty bool
4950
}{
50-
{[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", true},
51-
{nil, "host", true},
52-
{nil, "bridge", false},
53-
{nil, "none", true},
51+
{[]*ni.NetworkInterface{{ID: "ec2Id"}}, "awsvpc", false, true},
52+
{nil, "host", false, true},
53+
{nil, "bridge", false, false},
54+
{nil, "bridge", true, true},
55+
{nil, "none", false, true},
5456
}
5557
for _, tc := range networkModes {
56-
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.StatsEmpty)
58+
testNetworkModeStats(t, tc.NetworkMode, tc.ENIs, false, tc.NetworkMetricsDisabled, tc.StatsEmpty)
5759
}
5860
}
5961

0 commit comments

Comments
 (0)