| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576 |
- # Copyright 2018 DPR Authors, The Hugging Face Team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """PyTorch DPR model for Open Domain Question Answering."""
- from dataclasses import dataclass
- import torch
- from torch import Tensor, nn
- from ...modeling_outputs import BaseModelOutputWithPooling
- from ...modeling_utils import PreTrainedModel
- from ...utils import (
- ModelOutput,
- auto_docstring,
- logging,
- )
- from ..bert.modeling_bert import BertModel
- from .configuration_dpr import DPRConfig
# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
- ##########
- # Outputs
- ##########
@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRContextEncoder`].
    """
)
class DPRContextEncoderOutput(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
        The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
        This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
    """

    # `pooler_output` is the only required field; the optional extras default to None.
    pooler_output: torch.FloatTensor
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRQuestionEncoder`].
    """
)
class DPRQuestionEncoderOutput(ModelOutput):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
        The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
        hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
        This output is to be used to embed questions for nearest neighbors queries with context embeddings.
    """

    # `pooler_output` is the only required field; the optional extras default to None.
    pooler_output: torch.FloatTensor
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`DPRReader`].
    """
)
class DPRReaderOutput(ModelOutput):
    r"""
    start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
        Logits of the start index of the span for each passage.
    end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
        Logits of the end index of the span for each passage.
    relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
        Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
        question, compared to all the other passages.
    """

    # `start_logits` is the only required field; every other field defaults to None.
    start_logits: torch.FloatTensor
    end_logits: torch.FloatTensor | None = None
    relevance_logits: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None
@auto_docstring
class DPRPreTrainedModel(PreTrainedModel):
    # Common base class of every DPR model class in this file.
    # NOTE(review): `_supports_sdpa` presumably advertises scaled-dot-product-attention support to the
    # `PreTrainedModel` machinery — confirm against `PreTrainedModel`.
    _supports_sdpa = True
class DPREncoder(DPRPreTrainedModel):
    """
    BERT backbone (built without its pooling layer) followed by an optional linear projection of the first-token
    hidden state.
    """

    base_model_prefix = "bert_model"

    def __init__(self, config: DPRConfig):
        """Build the backbone and, when `config.projection_dim > 0`, the projection layer."""
        super().__init__(config)
        self.bert_model = BertModel(config, add_pooling_layer=False)
        backbone_width = self.bert_model.config.hidden_size
        if backbone_width <= 0:
            raise ValueError("Encoder hidden_size can't be zero")
        self.projection_dim = config.projection_dim
        # The projection only exists for a strictly positive `projection_dim`.
        if self.projection_dim > 0:
            self.encode_proj = nn.Linear(backbone_width, config.projection_dim)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Tensor,
        attention_mask: Tensor | None = None,
        token_type_ids: Tensor | None = None,
        inputs_embeds: Tensor | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
        **kwargs,
    ) -> BaseModelOutputWithPooling | tuple[Tensor, ...]:
        """
        Run the backbone and derive the pooled representation from the first token.

        Returns a `BaseModelOutputWithPooling` when `return_dict` is truthy; otherwise a tuple
        `(sequence_output, pooled_output, *remaining backbone outputs)`.
        """
        backbone_out = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        token_states = backbone_out[0]

        # Pool by taking the hidden state at position 0, then project it if a projection is configured.
        first_token_state = token_states[:, 0, :]
        if self.projection_dim > 0:
            first_token_state = self.encode_proj(first_token_state)

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=token_states,
                pooler_output=first_token_state,
                hidden_states=backbone_out.hidden_states,
                attentions=backbone_out.attentions,
            )
        return (token_states, first_token_state) + backbone_out[2:]

    @property
    def embeddings_size(self) -> int:
        """Width of the pooled output: the projection width if a projection is used, else the backbone hidden size."""
        has_projection = self.projection_dim > 0
        return self.encode_proj.out_features if has_projection else self.bert_model.config.hidden_size
class DPRSpanPredictor(DPRPreTrainedModel):
    """
    Span-extraction heads on top of a [`DPREncoder`]: per-token start/end logits plus one relevance logit per
    passage.
    """

    base_model_prefix = "encoder"

    def __init__(self, config: DPRConfig):
        """Build the encoder and the two linear heads."""
        super().__init__(config)
        self.encoder = DPREncoder(config)
        # Both heads are applied to the encoder's sequence output (`outputs[0]` in `forward`), which is never
        # projected, so they must be sized by the backbone hidden size. `encoder.embeddings_size` equals
        # `projection_dim` when a projection is configured and would not match that input.
        hidden_size = self.encoder.bert_model.config.hidden_size
        self.qa_outputs = nn.Linear(hidden_size, 2)
        self.qa_classifier = nn.Linear(hidden_size, 1)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Tensor | None,
        attention_mask: Tensor,
        inputs_embeds: Tensor | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = False,
        **kwargs,
    ) -> DPRReaderOutput | tuple[Tensor, ...]:
        """
        Compute span start/end logits and passage relevance logits.

        Args:
            input_ids: Token ids of shape `(n_passages, sequence_length)`, or `None` when `inputs_embeds` is given.
            attention_mask: Mask forwarded unchanged to the encoder.
            inputs_embeds: Alternative to `input_ids`; its first two dimensions are read as
                `(n_passages, sequence_length)`.
            output_attentions: Forwarded to the encoder.
            output_hidden_states: Forwarded to the encoder.
            return_dict: When truthy, return a `DPRReaderOutput` instead of a tuple.

        Returns:
            A `DPRReaderOutput`, or the tuple `(start_logits, end_logits, relevance_logits, *encoder extras)`.
            `start_logits` and `end_logits` have shape `(n_passages, sequence_length)`; `relevance_logits` has shape
            `(n_passages,)`.
        """
        # notations: n_passages - number of passages in the batch, sequence_length - tokens per passage
        n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2]
        # feed encoder
        outputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        # compute logits: the 2-wide head yields one start and one end score per token
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()
        # relevance is scored from the first token of each passage
        relevance_logits = self.qa_classifier(sequence_output[:, 0, :])

        # resize
        start_logits = start_logits.view(n_passages, sequence_length)
        end_logits = end_logits.view(n_passages, sequence_length)
        relevance_logits = relevance_logits.view(n_passages)

        if not return_dict:
            return (start_logits, end_logits, relevance_logits) + outputs[2:]

        return DPRReaderOutput(
            start_logits=start_logits,
            end_logits=end_logits,
            relevance_logits=relevance_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
- ##################
- # PreTrainedModel
- ##################
class DPRPretrainedContextEncoder(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.

    Declares the `DPRConfig` configuration type and the `"ctx_encoder"` base-model prefix, which matches the
    attribute under which `DPRContextEncoder` stores its encoder.
    """

    config: DPRConfig
    base_model_prefix = "ctx_encoder"
class DPRPretrainedQuestionEncoder(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.

    Declares the `DPRConfig` configuration type and the `"question_encoder"` base-model prefix, which matches the
    attribute under which `DPRQuestionEncoder` stores its encoder.
    """

    config: DPRConfig
    base_model_prefix = "question_encoder"
class DPRPretrainedReader(DPRPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.

    Declares the `DPRConfig` configuration type and the `"span_predictor"` base-model prefix, which matches the
    attribute under which `DPRReader` stores its span predictor.
    """

    config: DPRConfig
    base_model_prefix = "span_predictor"
- ###############
- # Actual Models
- ###############
@auto_docstring(
    custom_intro="""
    The bare DPRContextEncoder transformer outputting pooler outputs as context representations.
    """
)
class DPRContextEncoder(DPRPretrainedContextEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.ctx_encoder = DPREncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Tensor | None = None,
        attention_mask: Tensor | None = None,
        token_type_ids: Tensor | None = None,
        inputs_embeds: Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> DPRContextEncoderOutput | tuple[Tensor, ...]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
            formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs (for a pair title+text for example):

            ```
            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            ```

            (b) For single sequences (for a question for example):

            ```
            tokens:         [CLS] the dog is hairy . [SEP]
            token_type_ids:   0   0   0   0  0     0   0
            ```

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

        Examples:

        ```python
        >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```"""
        # Any flag left as None falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            input_shape, device = input_ids.size(), input_ids.device
        else:
            # Drop the trailing embedding dimension to get the token grid shape.
            input_shape, device = inputs_embeds.size()[:-1], inputs_embeds.device

        if attention_mask is None:
            if input_ids is None:
                attention_mask = torch.ones(input_shape, device=device)
            else:
                # Attend to every position that is not the padding token.
                attention_mask = input_ids != self.config.pad_token_id
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        encoder_out = self.ctx_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if return_dict:
            return DPRContextEncoderOutput(
                pooler_output=encoder_out.pooler_output,
                hidden_states=encoder_out.hidden_states,
                attentions=encoder_out.attentions,
            )
        # Tuple form: skip the leading sequence output and keep everything after it.
        return encoder_out[1:]
@auto_docstring(
    custom_intro="""
    The bare DPRQuestionEncoder transformer outputting pooler outputs as question representations.
    """
)
class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.question_encoder = DPREncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Tensor | None = None,
        attention_mask: Tensor | None = None,
        token_type_ids: Tensor | None = None,
        inputs_embeds: Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> DPRQuestionEncoderOutput | tuple[Tensor, ...]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. To match pretraining, DPR input sequence should be
            formatted with [CLS] and [SEP] tokens as follows:

            (a) For sequence pairs (for a pair title+text for example):

            ```
            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            ```

            (b) For single sequences (for a question for example):

            ```
            tokens:         [CLS] the dog is hairy . [SEP]
            token_type_ids:   0   0   0   0  0     0   0
            ```

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

        Examples:

        ```python
        >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
        >>> embeddings = model(input_ids).pooler_output
        ```
        """
        # Any flag left as None falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            # The padding check runs before any mask is synthesised below.
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape, device = input_ids.size(), input_ids.device
        else:
            # Drop the trailing embedding dimension to get the token grid shape.
            input_shape, device = inputs_embeds.size()[:-1], inputs_embeds.device

        if attention_mask is None:
            if input_ids is None:
                attention_mask = torch.ones(input_shape, device=device)
            else:
                # Attend to every position that is not the padding token.
                attention_mask = input_ids != self.config.pad_token_id
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        encoder_out = self.question_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if return_dict:
            return DPRQuestionEncoderOutput(
                pooler_output=encoder_out.pooler_output,
                hidden_states=encoder_out.hidden_states,
                attentions=encoder_out.attentions,
            )
        # Tuple form: skip the leading sequence output and keep everything after it.
        return encoder_out[1:]
@auto_docstring(
    custom_intro="""
    The bare DPRReader transformer outputting span predictions.
    """
)
class DPRReader(DPRPretrainedReader):
    def __init__(self, config: DPRConfig):
        super().__init__(config)
        self.config = config
        self.span_predictor = DPRSpanPredictor(config)
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Tensor | None = None,
        attention_mask: Tensor | None = None,
        inputs_embeds: Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> DPRReaderOutput | tuple[Tensor, ...]:
        r"""
        input_ids (`tuple[torch.LongTensor]` of shapes `(n_passages, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. It has to be a sequence triplet with 1) the question
            and 2) the passages titles and 3) the passages texts To match pretraining, DPR `input_ids` sequence should
            be formatted with [CLS] and [SEP] with the format:

                `[CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>`

            DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
            rather than the left.

            Indices can be obtained using [`DPRReaderTokenizer`]. See this class documentation for more details.

            [What are input IDs?](../glossary#input-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(n_passages, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

        Examples:

        ```python
        >>> from transformers import DPRReader, DPRReaderTokenizer

        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
        >>> encoded_inputs = tokenizer(
        ...     questions=["What is love ?"],
        ...     titles=["Haddaway"],
        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
        ...     return_tensors="pt",
        ... )
        >>> outputs = model(**encoded_inputs)
        >>> start_logits = outputs.start_logits
        >>> end_logits = outputs.end_logits
        >>> relevance_logits = outputs.relevance_logits
        ```
        """
        # Any flag left as None falls back to the value stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.return_dict

        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape, device = input_ids.size(), input_ids.device
        else:
            # Drop the trailing embedding dimension to get the token grid shape.
            input_shape, device = inputs_embeds.size()[:-1], inputs_embeds.device

        # Without an explicit mask, every position is attended to.
        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)

        predictions = self.span_predictor(
            input_ids,
            attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return predictions
# Public names exported by this module. The output dataclasses and the internal `DPREncoder` /
# `DPRSpanPredictor` building blocks are not listed.
__all__ = [
    "DPRContextEncoder",
    "DPRPretrainedContextEncoder",
    "DPRPreTrainedModel",
    "DPRPretrainedQuestionEncoder",
    "DPRPretrainedReader",
    "DPRQuestionEncoder",
    "DPRReader",
]
|