Unverified Commit 384cba12 authored by Goutham Veeramachaneni's avatar Goutham Veeramachaneni Committed by GitHub
Browse files

Add flag for size based retention (#5109)

* Add flag for size based retention
Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

* Deprecate the old retention flag for a new one.
Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

* Add ability to take a suffix for size flag
Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

* Address feedback
Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>
parent 3bd41cc9
......@@ -71,10 +71,19 @@ var (
Name: "prometheus_config_last_reload_success_timestamp_seconds",
Help: "Timestamp of the last successful configuration reload.",
defaultRetentionString = "15d"
defaultRetentionDuration model.Duration
func init() {
var err error
defaultRetentionDuration, err = model.ParseDuration(defaultRetentionString)
if err != nil {
func main() {
......@@ -83,6 +92,11 @@ func main() {
var (
oldFlagRetentionDuration model.Duration
newFlagRetentionDuration model.Duration
cfg := struct {
configFile string
......@@ -171,8 +185,14 @@ func main() {
"Size at which to split the tsdb WAL segment files (e.g. 100MB)").
a.Flag("storage.tsdb.retention", "How long to retain samples in storage.").
a.Flag("storage.tsdb.retention", "[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use \"storage.tsdb.retention.time\" instead").
a.Flag("storage.tsdb.retention.time", "How long to retain samples in storage. Overrides \"storage.tsdb.retention\" if this flag is set to anything other than default.").
a.Flag("storage.tsdb.retention.size", "[EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units supported: KB, MB, GB, TB, PB. This flag is experimental and can be changed in future releases.").
a.Flag("storage.tsdb.no-lockfile", "Do not create lockfile in data directory.").
......@@ -244,8 +264,10 @@ func main() {
// RoutePrefix must always be at least '/'.
cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/")
cfg.tsdb.RetentionDuration = chooseRetention(oldFlagRetentionDuration, newFlagRetentionDuration)
if cfg.tsdb.MaxBlockDuration == 0 {
cfg.tsdb.MaxBlockDuration = cfg.tsdb.Retention / 10
cfg.tsdb.MaxBlockDuration = cfg.tsdb.RetentionDuration / 10
promql.LookbackDelta = time.Duration(cfg.lookbackDelta)
......@@ -253,6 +275,10 @@ func main() {
logger := promlog.New(&cfg.promlogConfig)
if oldFlagRetentionDuration != defaultRetentionDuration {
level.Warn(logger).Log("deprecation_notice", `"storage.tsdb.retention" flag is deprecated use "storage.tsdb.retention.time" instead.`)
// Above level 6, the k8s client would log bearer tokens in clear-text.
klog.SetLogger(log.With(logger, "component", "k8s_client_runtime"))
......@@ -757,3 +783,19 @@ func sendAlerts(s sender, externalURL string) rules.NotifyFunc {
// chooseRetention returns the retention duration to use given the values of the
// deprecated retention flag (oldFlagDuration) and its replacement (newFlagDuration).
// A value equal to defaultRetentionDuration is treated as "flag not set", so a flag
// explicitly set to the default is indistinguishable from an unset one.
// The new flag wins whenever it differs from the default; otherwise the old flag's
// value is returned, which is the default itself when neither flag was changed.
func chooseRetention(oldFlagDuration, newFlagDuration model.Duration) model.Duration {
retention := oldFlagDuration
if retention == defaultRetentionDuration {
retention = newFlagDuration
// Further newFlag takes precedence if it's set to anything other than default.
if newFlagDuration != defaultRetentionDuration {
retention = newFlagDuration
return retention
......@@ -25,6 +25,7 @@ import (
......@@ -284,3 +285,31 @@ func TestWALSegmentSizeBounds(t *testing.T) {
// TestChooseRetention checks chooseRetention for each combination of the old and
// new retention flags being left at the default or set to a non-default value.
func TestChooseRetention(t *testing.T) {
retention1, err := model.ParseDuration("20d")
testutil.Ok(t, err)
retention2, err := model.ParseDuration("30d")
testutil.Ok(t, err)
cases := []struct {
oldFlagRetention model.Duration
newFlagRetention model.Duration
chosen model.Duration
// Both are default (unset flags).
{defaultRetentionDuration, defaultRetentionDuration, defaultRetentionDuration},
// Old flag is set and new flag is unset.
{retention1, defaultRetentionDuration, retention1},
// Old flag is unset and new flag is set.
{defaultRetentionDuration, retention2, retention2},
// Both flags are set.
{retention1, retention2, retention2},
for _, tc := range cases {
retention := chooseRetention(tc.oldFlagRetention, tc.newFlagRetention)
testutil.Equals(t, tc.chosen, retention)
......@@ -52,7 +52,9 @@ For further details on file format, see [TSDB format](https://github.com/prometh
Prometheus has several flags that allow configuring the local storage. The most important ones are:
* `--storage.tsdb.path`: This determines where Prometheus writes its database. Defaults to `data/`.
* `--storage.tsdb.retention`: This determines when to remove old data. Defaults to `15d`.
* `--storage.tsdb.retention.time`: This determines when to remove old data. Defaults to `15d`. Overrides `storage.tsdb.retention` if this flag is set to anything other than default.
* `--storage.tsdb.retention.size`: [EXPERIMENTAL] This determines the maximum number of bytes that storage blocks can use (note that this does not include the WAL size, which can be substantial). The oldest data will be removed first. Defaults to `0` or disabled. This flag is experimental and can be changed in future releases. Units supported: KB, MB, GB, TB, PB. Ex: "512MB"
* `--storage.tsdb.retention`: This flag has been deprecated in favour of `storage.tsdb.retention.time`.
On average, Prometheus uses only around 1-2 bytes per sample. Thus, to plan the capacity of a Prometheus server, you can use the rough formula:
......@@ -64,6 +66,8 @@ To tune the rate of ingested samples per second, you can either reduce the numbe
If your local storage becomes corrupted for whatever reason, your best bet is to shut down Prometheus and remove the entire storage directory. However, you can also try removing individual block directories to resolve the problem. This means losing a time window of around two hours worth of data per block directory. Again, Prometheus's local storage is not meant as durable long-term storage.
If both time and size retention policies are specified, whichever policy triggers first at any given moment determines which data is removed.
## Remote storage integrations
Prometheus's local storage is limited by single nodes in its scalability and durability. Instead of trying to solve clustered storage in Prometheus itself, Prometheus has a set of interfaces that allow integrating with remote storage systems.
......@@ -119,7 +119,10 @@ type Options struct {
WALSegmentSize units.Base2Bytes
// Duration for how long to retain data.
Retention model.Duration
RetentionDuration model.Duration
// Maximum number of bytes to be retained.
MaxBytes units.Base2Bytes
// Disable creation and consideration of lockfile.
NoLockfile bool
......@@ -183,7 +186,8 @@ func Open(path string, l log.Logger, r prometheus.Registerer, opts *Options) (*t
db, err := tsdb.Open(path, l, r, &tsdb.Options{
WALSegmentSize: int(opts.WALSegmentSize),
RetentionDuration: uint64(time.Duration(opts.Retention).Seconds() * 1000),
RetentionDuration: uint64(time.Duration(opts.RetentionDuration).Seconds() * 1000),
MaxBytes: int64(opts.MaxBytes),
BlockRanges: rngs,
NoLockfile: opts.NoLockfile,
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment