So I've built a connector to one of my API's and i...
# singer-tap-development
i
So I've built a connector to one of my APIs, and initially I used the date range for pagination (the code takes the toDate from the previous request, adds one second to it, and then queries a new 2-week date range). However, the API also allows pagination with pageNumber and pageSize parameters. Has anyone handled a scenario where you had to use both of these? Sometimes the response is too big for Meltano to process, so I'd like to chunk it into 10,000 records at a time. I'm a bit confused about how I should implement this, since the paginator class only deals with the date range. Here's what my custom paginator class looks like:
Copy code
class JobDivaPaginator(BaseAPIPaginator):
    """Date-window paginator whose page token is the start date of the next window."""

    def __init__(self, *args, **kwargs):
        # Always pass None as the paginator's start value.
        super().__init__(None, *args, **kwargs)

    def get_next(self, response):
        """Return the previous request's ``toDate`` as a date, plus a one-second timedelta."""
        # Recover the query parameters of the request that produced this response.
        previous_url = urlparse(response.request.url)
        query = dict(parse_qsl(previous_url.query))

        previous_to_date = datetime.strptime(query["toDate"], OUTPUT_DATE_FORMAT).date()
        # NOTE(review): adding seconds to a ``date`` leaves it unchanged, so
        # the value returned is the ``toDate`` calendar day itself.
        return previous_to_date + timedelta(seconds=1)

    def has_more(self, response):
        """Return True while the next window would start before today."""
        upcoming_start = self.get_next(response)
        return upcoming_start < date.today()
and my get_url_params:
Copy code
def get_url_params(
    self,
    context: dict | None,  # noqa: ARG002
    next_page_token: date | None,  # noqa: ANN401
) -> dict[str, Any]:
    """Build the ``fromDate``/``toDate`` query parameters for one request.

    Args:
        context: Passed through to ``get_starting_replication_key_value``.
        next_page_token: Start of the window to request; when falsy, the
            starting replication key value is parsed and used instead.

    Returns:
        Dict holding the formatted ``fromDate`` and ``toDate`` values.
    """
    bookmark = self.get_starting_replication_key_value(context)

    if next_page_token:
        window_start = next_page_token
    else:
        # First request: fall back to the stored/starting replication value.
        window_start = datetime.strptime(bookmark, INPUT_DATE_FORMAT).date()

    # The window end is the start plus 14 days, minus one second.
    window_end = window_start + timedelta(days=14) - timedelta(seconds=1)

    return {
        "fromDate": window_start.strftime(OUTPUT_DATE_FORMAT),
        "toDate": window_end.strftime(OUTPUT_DATE_FORMAT),
    }
1
e
The example in https://github.com/meltano/sdk/issues/2318 might be helpful.
i
Copy code
class PageValue(t.NamedTuple):
    """Page token pairing a date-window start with a page number in that window."""

    # Start of the date window being requested. Annotated with the ``date``
    # type: the surrounding code calls ``datetime.strptime``, so ``datetime``
    # is the class and ``datetime.date`` would be a method, not a type.
    start_date: date
    # Page to request within that window; a new window starts at page 1.
    page_number: int

class JobDivaPaginator(BaseAPIPaginator[PageValue]):
    """Paginate by date window and, inside each window, by page number.

    Every token is a ``PageValue(start_date, page_number)``. While a window
    keeps returning records the page number is advanced; once a page comes
    back empty the paginator moves on to the next window and restarts at
    page 1.
    """

    def __init__(self, *args, **kwargs):
        # Always pass None as the paginator's start value.
        super().__init__(None, *args, **kwargs)

    def has_more(self, response):
        """Return True while the next token's window starts before today."""
        # get_next() takes the response itself and returns a PageValue, so
        # compare its start_date (not the whole tuple) against today's date.
        return self.get_next(response).start_date < date.today()

    def get_next(self, response):
        """Return the ``PageValue`` to request after ``response``.

        This must stay a plain function: a ``yield`` in the body would turn
        it into a generator and callers would never receive a ``PageValue``.
        """
        # Query-string values of the request that produced this response.
        # parse_qsl yields strings, so numeric values need explicit conversion.
        params = dict(parse_qsl(urlparse(response.request.url).query))

        # Only emptiness matters, so stop at the first extracted record.
        # NOTE(review): assumes ``records_jsonpath`` is set on the paginator
        # instance -- confirm where it comes from.
        has_records = any(
            True
            for _ in extract_jsonpath(self.records_jsonpath, input=response.json())
        )

        if not has_records:
            # Window exhausted: start the next window at page 1.
            # NOTE(review): adding seconds to a ``date`` leaves it unchanged,
            # so the next window starts on the ``toDate`` calendar day.
            next_start_date = (
                datetime.strptime(params["toDate"], OUTPUT_DATE_FORMAT).date()
                + timedelta(seconds=1)
            )
            return PageValue(next_start_date, 1)

        # Same window, next page: keep the window's own start (``fromDate``).
        # Using ``toDate`` here would shift the window on every page.
        current_start_date = datetime.strptime(
            params["fromDate"], OUTPUT_DATE_FORMAT
        ).date()
        # A request without an explicit pageNumber is treated as page 1.
        page_number = int(params.get("pageNumber", 1)) + 1
        return PageValue(current_start_date, page_number)


def get_url_params(
        self,
        context: dict | None,  # noqa: ARG002
        next_page_token,  # noqa: ANN401
    ) -> dict[str, Any]:
        """Build the date-window and page-number query parameters for one request.

        Args:
            context: Passed through to ``get_starting_replication_key_value``.
            next_page_token: Object exposing ``start_date`` and ``page_number``.

        Returns:
            Dict with ``fromDate``, ``toDate``, ``pageNumber`` and ``pageSize``.
        """
        
        #start_value = self.config["start_date"] 
        start_value = self.get_starting_replication_key_value(context)
        # NOTE(review): next_page_token is dereferenced unconditionally, so a
        # None token raises AttributeError before the ``or`` fallback applies.
        from_date = (
        next_page_token.start_date
        or datetime.strptime(start_value, INPUT_DATE_FORMAT).date()
        )
        # Window end: the start plus 14 days, minus one second.
        to_date = from_date + timedelta(days=14) - timedelta(seconds=1)

        page_number = next_page_token.page_number

        return {
            "fromDate": from_date.strftime(OUTPUT_DATE_FORMAT),
            "toDate": to_date.strftime(OUTPUT_DATE_FORMAT),
            "pageNumber": page_number,
            "pageSize": 1000,
        }
Here's what I did roughly. In the stack trace, though, I'm getting
AttributeError: 'NoneType' object has no attribute 'start_date'
from the next_page_token.start_date. Does the get_next() method return the next_page_token object for the get_url_params() method in the stream class?
r
It's probably
None
on the first request, before any call to your paginator
get_next
.
1
e
yeah I'd try to add a guard to check if
next_page_token
is
None
instead of checking if
next_page_token.start_date
is truthy
i
Copy code
def get_url_params(
        self,
        context: dict | None,  # noqa: ARG002
        next_page_token: Optional[PageValue] = None,  # noqa: ANN401
    ) -> dict[str, Any]:
        """Compute the request window and page number, defaulting a missing token.

        When ``next_page_token`` is None it is replaced by
        ``PageValue(start_value, 1)``. This excerpt stops before any return
        statement.
        """
        
        start_value = self.get_starting_replication_key_value(context)

        if next_page_token is None:
            next_page_token = PageValue(start_value, 1)

        # NOTE(review): the default token above stores start_value unparsed,
        # so when start_value is truthy from_date keeps whatever type it has
        # and the strptime fallback below is never reached.
        from_date = (
            next_page_token.start_date
            or datetime.strptime(start_value, INPUT_DATE_FORMAT).date()
            )
        # Debug output showing the value and type of from_date.
        print("from_date:", from_date, "type:", type(from_date))

        # Window end: the start plus 14 days, minus one second.
        to_date = from_date + timedelta(days=14) - timedelta(seconds=1)
        page_number = next_page_token.page_number
Yeah, so I did this to fix it. Now I'm working out why from_date is set to a string, according to the print statement: 'from_date: 2023-11-01T000000 type: <class 'str'>' from the print().
e
these lines here
Copy code
start_value = self.get_starting_replication_key_value(context)

        if next_page_token is None:
            next_page_token = PageValue(start_value, 1)
make it a string I think. You probably want to apply
strptime(start_value)
earlier.
🙌 1
i
ahhhhh its always the obvious thing hahaha
thank you
e
np!