Upstream targets not taken out of pool when upstream tests fail

Hi,
We’re having some trouble getting uptime tests to cooperate with load balancing on our APIs. I have set it up as described in the following docs, but it doesn’t seem to work as expected. When a health check fails, it registers fail events, but the related upstream target is not taken out of the load balancer pool.

https://tyk.io/tyk-documentation/ensure-high-availability/uptime-tests/

Strangely, we don’t see any option to associate an upstream target with a health check URL, but we’re assuming you are using the domain / port combination to create the mapping. Is this correct?

A sample configuration is below:

== Core Settings ==
Upstream targets:

[x] http://subdomainA.domain.com:8890
[x] http://subdomainB.domain.com:8890

== Uptime Tests ==

Check host against downtime list (enabled)
Use a discovery service (disabled)

Check URLs

GET http://subdomainA.domain.com:8890/v1/health
GET http://subdomainB.domain.com:8890/v1/health

We are running version:
tyk-gateway.x86_64 2.3.0.33-1 @DEC16-tyktyk-6-x86_64

Hi Jason,

Are you able to provide the output of the failing health check at all? This might help us to troubleshoot the issue.

Also, it looks like the version of Tyk that you’re using is quite old, and we do encourage our users to keep their software up to date in case they run into any of the bugs that we’ve patched in the past. It may be worth upgrading before attempting to run this health check again (the latest version of Tyk can be downloaded from this link).

Kind regards,
Jess @ Tyk

This is correct, health check associations are to the domain:port.

I’ve attached an API definition which has just been tested with the LB and upstream tests (alternating taking one or the other host offline), you might want to compare it to yours:

{
    "id": "58c377625b03d301636e4342",
    "name": "lbus",
    "slug": "lbus",
    "api_id": "ac85babe7797439a7b426dc5ad22a5ba",
    "org_id": "589a2058c2e9f9018bad03aa",
    "use_keyless": true,
    "use_oauth2": false,
    "use_openid": false,
    "openid_options": {
        "providers": [],
        "segregate_by_client": false
    },
    "oauth_meta": {
        "allowed_access_types": [],
        "allowed_authorize_types": [],
        "auth_login_redirect": ""
    },
    "auth": {
        "use_param": false,
        "param_name": "",
        "use_cookie": false,
        "cookie_name": "",
        "auth_header_name": ""
    },
    "use_basic_auth": false,
    "enable_jwt": false,
    "use_standard_auth": false,
    "enable_coprocess_auth": false,
    "jwt_signing_method": "",
    "jwt_source": "",
    "jwt_identity_base_field": "",
    "jwt_client_base_field": "",
    "jwt_policy_field_name": "",
    "notifications": {
        "shared_secret": "",
        "oauth_on_keychange_url": ""
    },
    "enable_signature_checking": false,
    "hmac_allowed_clock_skew": -1,
    "base_identity_provided_by": "",
    "definition": {
        "location": "header",
        "key": "x-api-version"
    },
    "version_data": {
        "not_versioned": true,
        "versions": {
            "Default": {
                "name": "Default",
                "expires": "",
                "paths": {
                    "ignored": [],
                    "white_list": [],
                    "black_list": []
                },
                "use_extended_paths": true,
                "extended_paths": {},
                "global_headers": {},
                "global_headers_remove": [],
                "global_size_limit": 0,
                "override_target": ""
            }
        }
    },
    "uptime_tests": {
        "check_list": [
            {
                "url": "http://tyk-httpbin:8000/get",
                "method": "GET",
                "headers": {},
                "body": ""
            },
            {
                "url": "http://tyk-httpbin2:8000/get",
                "method": "GET",
                "headers": {},
                "body": ""
            }
        ],
        "config": {
            "expire_utime_after": 0,
            "service_discovery": {
                "use_discovery_service": false,
                "query_endpoint": "",
                "use_nested_query": false,
                "parent_data_path": "",
                "data_path": "",
                "port_data_path": "",
                "target_path": "",
                "use_target_list": false,
                "cache_timeout": 60,
                "endpoint_returns_list": false
            },
            "recheck_wait": 0
        }
    },
    "proxy": {
        "preserve_host_header": false,
        "listen_path": "/lbus/",
        "target_url": "",
        "strip_listen_path": true,
        "enable_load_balancing": true,
        "target_list": [
            "http://tyk-httpbin:8000",
            "http://tyk-httpbin2:8000"
        ],
        "check_host_against_uptime_tests": true,
        "service_discovery": {
            "use_discovery_service": false,
            "query_endpoint": "",
            "use_nested_query": false,
            "parent_data_path": "",
            "data_path": "hostname",
            "port_data_path": "port",
            "target_path": "/api-slug",
            "use_target_list": false,
            "cache_timeout": 60,
            "endpoint_returns_list": false
        }
    },
    "disable_rate_limit": false,
    "disable_quota": false,
    "custom_middleware": {
        "pre": [],
        "post": [],
        "post_key_auth": [],
        "auth_check": {
            "name": "",
            "path": "",
            "require_session": false
        },
        "response": [],
        "driver": "",
        "id_extractor": {
            "extract_from": "",
            "extract_with": "",
            "extractor_config": {}
        }
    },
    "custom_middleware_bundle": "",
    "cache_options": {
        "cache_timeout": 60,
        "enable_cache": true,
        "cache_all_safe_requests": false,
        "cache_response_codes": [],
        "enable_upstream_cache_control": false
    },
    "session_lifetime": 0,
    "active": true,
    "auth_provider": {
        "name": "",
        "storage_engine": "",
        "meta": {}
    },
    "session_provider": {
        "name": "",
        "storage_engine": "",
        "meta": null
    },
    "event_handlers": {
        "events": {}
    },
    "enable_batch_request_support": false,
    "enable_ip_whitelisting": false,
    "allowed_ips": [],
    "dont_set_quota_on_create": false,
    "expire_analytics_after": 0,
    "response_processors": [],
    "CORS": {
        "enable": false,
        "allowed_origins": [],
        "allowed_methods": [],
        "allowed_headers": [],
        "exposed_headers": [],
        "allow_credentials": false,
        "max_age": 24,
        "options_passthrough": false,
        "debug": false
    },
    "domain": "",
    "do_not_track": false,
    "tags": [],
    "enable_context_vars": false
}

This is the output you should expect:

[Mar 11 04:22:00]  WARN host-check-mgr: [HOST CHECKER MANAGER] Host is DOWN: http://tyk-httpbin2:8000/get
[Mar 11 04:22:02]  WARN host-check-mgr: [HOST CHECKER MANAGER] Host is DOWN: http://tyk-httpbin2:8000/get
[Mar 11 04:22:04]  WARN host-check-mgr: [HOST CHECKER MANAGER] Host is DOWN: http://tyk-httpbin2:8000/get
[Mar 11 04:22:06]  WARN host-check-mgr: [HOST CHECKER MANAGER] Host is DOWN: http://tyk-httpbin2:8000/get

The hosts here are tyk-httpbin and tyk-httpbin2; these are containers we run locally to test this kind of thing. You could replace the hostnames and tests with yours, though…

M

Hi Martin,
Thanks for getting back to me. I can confirm that our config is the same as yours. At this point we’ll try upgrading and see if that fixes it. I’ll get back to you with results.

Cheers

1 Like

Hi Martin, we have upgraded to v2.3.3, and we’re still seeing the problem as described above.

The gateway logs are showing:
time=“Mar 21 13:45:40” level=warning msg=“[HOST CHECKER MANAGER] Host is DOWN: http://subdomainB:8890/v1/health”

And when requests are made:
time=“Mar 21 13:44:28” level=error msg=“http: proxy error: dial tcp xxx.xxx.xxx.xxx:8890: getsockopt: connection refused” api_id=xxxxx org_id=xxxxxx server_name=“subdomainB:8890” user_id= user_ip=172.28.18.180 user_name=
time=“Mar 21 13:44:28” level=error msg=“request error: There was a problem proxying the request” api_id=xxxxx org_id=xxxxx path=“/v1/apixxxx/” server_name= user_id= user_ip=“172.28.18.180, 172.28.18.180”

Node A is active and serving requests.
Node B is not active, but the gateway is still trying to send traffic to it.

Did you use the api definition that we provided?

Hi Martin, I just did a diff between your config and ours, and the only differences are the api name/slug/hosts, and that we are using versioning - everything else looks the same.

Thanks for confirming - let us verify it, as it might be a bug with versioned APIs.

How are the apis versioned? Header, querystring or URL?

We’re using First URL element

1 Like

Thanks for letting us know - we’ll investigate shortly.

I turned off versioning and there’s no change.

That’s strange - we managed to replicate the issue with versioning enabled - and the test definition above with httpbin works.

We’ve got it in our backlog as an issue:

Hello! I believe we fixed the issue Fix Uptime tests reverse logic by buger · Pull Request #635 · TykTechnologies/tyk · GitHub

So the basic idea is that it did not work properly with "failure_trigger_sample_size": 1 in tyk.conf:
it continued sending traffic to the host until the next health check.

We have not decided yet whether it will be included in the next patch release (and whether there will be more patch releases), or in the upcoming 2.4 release.

Thank you!

Great news. Thanks for letting us know Leon. Do you have an eta on release?

Most likely it will be a new patch release, in the next 2 weeks. We just made a new patch release yesterday, so we need some time before doing a new one, to gather more bugfixes.

So I have a similar problem, but my “failure_trigger_sample_size” is 2.
I’m load balancing between three nodes, wiwnd331-333.
If I shut down one of the nodes (333), this is the gateway log:

[HOST CHECKER MANAGER] Host is DOWN: http://wiwnd333.net:19081/HealthCheckEndpoint.Application/HealthCheckEndpoint/Health
http: proxy error: read tcp connection reset by peer" server_name=“wiwnd333.net:19081

I have a separate API definition that should do the health checks for all my other APIs that also connect to those three nodes:

{
    "id": "xx",
    "name": "Health Check",
    "slug": "health-check",
    "api_id": "xx",
    "org_id": "xx",
    "use_keyless": true,
    "use_oauth2": false,
    "use_openid": false,
    "openid_options": {
        "providers": [],
        "segregate_by_client": false
    },
    "oauth_meta": {
        "allowed_access_types": [],
        "allowed_authorize_types": [],
        "auth_login_redirect": ""
    },
    "auth": {
        "use_param": false,
        "param_name": "",
        "use_cookie": false,
        "cookie_name": "",
        "auth_header_name": ""
    },
    "use_basic_auth": false,
    "enable_jwt": false,
    "use_standard_auth": false,
    "enable_coprocess_auth": false,
    "jwt_signing_method": "",
    "jwt_source": "",
    "jwt_identity_base_field": "",
    "jwt_client_base_field": "",
    "jwt_policy_field_name": "",
    "notifications": {
        "shared_secret": "",
        "oauth_on_keychange_url": ""
    },
    "enable_signature_checking": false,
    "hmac_allowed_clock_skew": -1,
    "base_identity_provided_by": "",
    "definition": {
        "location": "header",
        "key": "x-api-version"
    },
    "version_data": {
        "not_versioned": true,
        "versions": {
            "Default": {
                "name": "Default",
                "expires": "",
                "paths": {
                    "ignored": [],
                    "white_list": [],
                    "black_list": []
                },
                "use_extended_paths": true,
                "extended_paths": {},
                "global_headers": {},
                "global_headers_remove": [],
                "global_size_limit": 0,
                "override_target": ""
            }
        }
    },
    "uptime_tests": {
        "check_list": [
            {
                "url": "http://wiwnd331.net:19081/HealthCheckEndpoint.Application/HealthCheckEndpoint/Health",
                "method": "GET",
                "headers": {},
                "body": ""
            },
            {
                "url": "http://wiwnd332.net:19081/HealthCheckEndpoint.Application/HealthCheckEndpoint/Health",
                "method": "GET",
                "headers": {},
                "body": ""
            },
            {
                "url": "http://wiwnd333.net:19081/HealthCheckEndpoint.Application/HealthCheckEndpoint/Health",
                "method": "GET",
                "headers": {},
                "body": ""
            }
        ],
        "config": {
            "expire_utime_after": 0,
            "service_discovery": {
                "use_discovery_service": false,
                "query_endpoint": "",
                "use_nested_query": false,
                "parent_data_path": "",
                "data_path": "",
                "port_data_path": "",
                "target_path": "",
                "use_target_list": false,
                "cache_timeout": 60,
                "endpoint_returns_list": false
            },
            "recheck_wait": 0
        }
    },
    "proxy": {
        "preserve_host_header": false,
        "listen_path": "/health-check/",
        "target_url": "/",
        "strip_listen_path": true,
        "enable_load_balancing": false,
        "target_list": [],
        "check_host_against_uptime_tests": true,
        "service_discovery": {
            "use_discovery_service": false,
            "query_endpoint": "",
            "use_nested_query": false,
            "parent_data_path": "",
            "data_path": "hostname",
            "port_data_path": "port",
            "target_path": "/api-slug",
            "use_target_list": false,
            "cache_timeout": 60,
            "endpoint_returns_list": false
        }
    },
    "disable_rate_limit": false,
    "disable_quota": false,
    "custom_middleware": {
        "pre": [],
        "post": [],
        "post_key_auth": [],
        "auth_check": {
            "name": "",
            "path": "",
            "require_session": false
        },
        "response": [],
        "driver": "",
        "id_extractor": {
            "extract_from": "",
            "extract_with": "",
            "extractor_config": {}
        }
    },
    "custom_middleware_bundle": "",
    "cache_options": {
        "cache_timeout": 60,
        "enable_cache": true,
        "cache_all_safe_requests": false,
        "cache_response_codes": [],
        "enable_upstream_cache_control": false
    },
    "session_lifetime": 0,
    "active": true,
    "auth_provider": {
        "name": "",
        "storage_engine": "",
        "meta": {}
    },
    "session_provider": {
        "name": "",
        "storage_engine": "",
        "meta": null
    },
    "event_handlers": {
        "events": {}
    },
    "enable_batch_request_support": false,
    "enable_ip_whitelisting": false,
    "allowed_ips": [],
    "dont_set_quota_on_create": false,
    "expire_analytics_after": 0,
    "response_processors": [],
    "CORS": {
        "enable": false,
        "allowed_origins": [],
        "allowed_methods": [],
        "allowed_headers": [],
        "exposed_headers": [],
        "allow_credentials": false,
        "max_age": 24,
        "options_passthrough": false,
        "debug": false
    },
    "domain": "xx",
    "do_not_track": false,
    "tags": [],
    "enable_context_vars": false
}

The problem: every 3rd request runs into “There was a problem proxying the request”.

After the machine comes up again, the gateway notices it:
[HOST CHECKER MANAGER] Host is UP: http://wiwnd333.net:19081/HealthCheckEndpoint.Application/HealthCheckEndpoint/Health

And this is the API I try to call:

{
    "id": "xx",
    "name": "DealerSearch",
    "slug": "dealersearch",
    "api_id": "xx",
    "org_id": "xx",
    "use_keyless": false,
    "use_oauth2": false,
    "use_openid": false,
    "openid_options": {
        "providers": [],
        "segregate_by_client": false
    },
    "oauth_meta": {
        "allowed_access_types": [],
        "allowed_authorize_types": [],
        "auth_login_redirect": ""
    },
    "auth": {
        "use_param": false,
        "param_name": "",
        "use_cookie": false,
        "cookie_name": "",
        "auth_header_name": "X-ApiKey"
    },
    "use_basic_auth": false,
    "enable_jwt": false,
    "use_standard_auth": true,
    "enable_coprocess_auth": false,
    "jwt_signing_method": "",
    "jwt_source": "",
    "jwt_identity_base_field": "",
    "jwt_client_base_field": "",
    "jwt_policy_field_name": "",
    "notifications": {
        "shared_secret": "",
        "oauth_on_keychange_url": ""
    },
    "enable_signature_checking": false,
    "hmac_allowed_clock_skew": -1,
    "base_identity_provided_by": "",
    "definition": {
        "location": "header",
        "key": "Accept"
    },
    "version_data": {
        "not_versioned": true,
        "versions": {
            "application/vnd.x.v1+json": {
                "name": "application/vnd.x.v1+json",
                "expires": "",
                "paths": {
                    "ignored": [],
                    "white_list": [],
                    "black_list": []
                },
                "use_extended_paths": true,
                "extended_paths": {},
                "global_headers": {},
                "global_headers_remove": [],
                "global_size_limit": 0,
                "override_target": ""
            }
        }
    },
    "uptime_tests": {
        "check_list": [],
        "config": {
            "expire_utime_after": 0,
            "service_discovery": {
                "use_discovery_service": false,
                "query_endpoint": "xxx",
                "use_nested_query": false,
                "parent_data_path": "",
                "data_path": "tests",
                "port_data_path": "",
                "target_path": "",
                "use_target_list": false,
                "cache_timeout": 60,
                "endpoint_returns_list": false
            },
            "recheck_wait": 0
        }
    },
    "proxy": {
        "preserve_host_header": false,
        "listen_path": "/dealersearch/",
        "target_url": "",
        "strip_listen_path": true,
        "enable_load_balancing": true,
        "target_list": [
            "http://wiwnd331.net:19081/DealerApplication/DealerService/",
            "http://wiwnd332.net:19081/DealerApplication/DealerService/",
            "http://wiwnd333.net:19081/DealerApplication/DealerService/"
        ],
        "check_host_against_uptime_tests": true,
        "service_discovery": {
            "use_discovery_service": false,
            "query_endpoint": "xxx",
            "use_nested_query": false,
            "parent_data_path": "",
            "data_path": "Host",
            "port_data_path": "Port",
            "target_path": "",
            "use_target_list": false,
            "cache_timeout": 60,
            "endpoint_returns_list": false
        }
    },
    "disable_rate_limit": false,
    "disable_quota": false,
    "custom_middleware": {
        "pre": [],
        "post": [],
        "post_key_auth": [],
        "auth_check": {
            "name": "",
            "path": "",
            "require_session": false
        },
        "response": [],
        "driver": "",
        "id_extractor": {
            "extract_from": "",
            "extract_with": "",
            "extractor_config": {}
        }
    },
    "custom_middleware_bundle": "",
    "cache_options": {
        "cache_timeout": 60,
        "enable_cache": true,
        "cache_all_safe_requests": false,
        "cache_response_codes": [
            200
        ],
        "enable_upstream_cache_control": false
    },
    "session_lifetime": 0,
    "active": true,
    "auth_provider": {
        "name": "",
        "storage_engine": "",
        "meta": {}
    },
    "session_provider": {
        "name": "",
        "storage_engine": "",
        "meta": null
    },
    "event_handlers": {
        "events": {}
    },
    "enable_batch_request_support": false,
    "enable_ip_whitelisting": false,
    "allowed_ips": [],
    "dont_set_quota_on_create": false,
    "expire_analytics_after": 0,
    "response_processors": [],
    "CORS": {
        "enable": true,
        "allowed_origins": [
            "xxx"
        ],
        "allowed_methods": [],
        "allowed_headers": [
            "xxx"
        ],
        "exposed_headers": [],
        "allow_credentials": false,
        "max_age": 24,
        "options_passthrough": false,
        "debug": false
    },
    "domain": "xxx",
    "do_not_track": false,
    "tags": [],
    "enable_context_vars": false
}

Sorry for the wall of text, but I have been trying this for months and haven’t been able to get it to work …

This is part of my tyk.conf… The timeout is 60 seconds, and 2 samples at 30 seconds each is also 60 seconds… might this be a problem :expressionless: ?

"health_checks": {
        "enable_health_checks": true,
        "health_check_value_timeouts": 60
    },
       "uptime_tests": {
       "disable": false,
       "config": {
        "enable_uptime_analytics": true,
        "failure_trigger_sample_size": 2,
        "time_wait": 30,
        "checker_pool_size": 20
       }
    },

We’re aware of the issue - we can only replicate it with a sample size of one, but it will be fixed in a future release

Hello!

Just to clarify, do you get these errors even after a few minutes, when it should have reached all the limits?
Note that the current uptime logic assumes that the failure count should be config_value + 1 (this is also fixed in master). This means that if you have time_wait set to 30 seconds and failure_trigger_sample_size set to 2, the host needs (2 + 1) failed checks × 30 seconds, i.e. it should be down for at least 90 seconds, for the health check to take effect.

Regarding your timeout settings, health_check_value_timeouts has no direct connection with the uptime tests’ time_wait.

Leon, Tyk Team