buffile.c source code [PostgreSQL/src/backend/storage/file/buffile.c]

/*-------------------------------------------------------------------------
2	*
3	* buffile.c
4	* Management of large buffered temporary files.
5	*
6	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7	* Portions Copyright (c) 1994, Regents of the University of California
8	*
9	* IDENTIFICATION
10	* src/backend/storage/file/buffile.c
11	*
12	* NOTES:
13	*
14	* BufFiles provide a very incomplete emulation of stdio atop virtual Files
15	* (as managed by fd.c). Currently, we only support the buffered-I/O
16	* aspect of stdio: a read or write of the low-level File occurs only
17	* when the buffer is filled or emptied. This is an even bigger win
18	* for virtual Files than for ordinary kernel files, since reducing the
19	* frequency with which a virtual File is touched reduces "thrashing"
20	* of opening/closing file descriptors.
21	*
22	* Note that BufFile structs are allocated with palloc(), and therefore
23	* will go away automatically at query/transaction end. Since the underlying
24	* virtual Files are made with OpenTemporaryFile, all resources for
25	* the file are certain to be cleaned up even if processing is aborted
26	* by ereport(ERROR). The data structures required are made in the
27	* palloc context that was current when the BufFile was created, and
28	* any external resources such as temp files are owned by the ResourceOwner
29	* that was current at that time.
30	*
31	* BufFile also supports temporary files that exceed the OS file size limit
32	* (by opening multiple fd.c temporary files). This is an essential feature
33	* for sorts and hashjoins on large amounts of data.
34	*
35	* BufFile supports temporary files that can be made read-only and shared with
36	* other backends, as infrastructure for parallel execution. Such files need
37	* to be created as a member of a SharedFileSet that all participants are
38	* attached to.
39	*-------------------------------------------------------------------------
40	*/
41
42	#include "postgres.h"
43
44	#include "commands/tablespace.h"
45	#include "executor/instrument.h"
46	#include "miscadmin.h"
47	#include "pgstat.h"
48	#include "storage/fd.h"
49	#include "storage/buffile.h"
50	#include "storage/buf_internals.h"
51	#include "utils/resowner.h"
52
/*
 * Physical segment size used for BufFiles.
 *
 * A BufFile is cut into segments of one gigabyte each, independently of
 * RELSEG_SIZE, so that a large BufFile can be spread over several
 * tablespaces when more than one is available.  BUFFILE_SEG_SIZE is that
 * same limit expressed in BLCKSZ-sized blocks.
 */
#define MAX_PHYSICAL_FILESIZE	0x40000000
#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
60
61	/*
62	* This data structure represents a buffered file that consists of one or
63	* more physical files (each accessed through a virtual file descriptor
64	* managed by fd.c).
65	*/
66	struct BufFile
67	{
68	int numFiles; / number of physical files in set /
69	/ all files except the last have length exactly MAX_PHYSICAL_FILESIZE /
70	File files; /* palloc'd array with numFiles entries /
71
72	bool isInterXact; / keep open over transactions? /
73	bool dirty; / does buffer need to be written? /
74	bool readOnly; / has the file been set to read only? /
75
76	SharedFileSet fileset; /* space for segment files if shared /
77	const char name; /* name of this BufFile if shared /
78
79	/*
80	* resowner is the ResourceOwner to use for underlying temp files. (We
81	* don't need to remember the memory context we're using explicitly,
82	* because after creation we only repalloc our arrays larger.)
83	*/
84	ResourceOwner resowner;
85
86	/*
87	* "current pos" is position of start of buffer within the logical file.
88	* Position as seen by user of BufFile is (curFile, curOffset + pos).
89	*/
90	int curFile; / file index (0..n) part of current pos /
91	off_t curOffset; / offset part of current pos /
92	int pos; / next read/write position in buffer /
93	int nbytes; / total # of valid bytes in buffer /
94	PGAlignedBlock buffer;
95	};
96
97	static BufFile makeBufFileCommon(int* nfiles);
98	static BufFile *makeBufFile(File firstfile);
99	static void extendBufFile(BufFile *file);
100	static void BufFileLoadBuffer(BufFile *file);
101	static void BufFileDumpBuffer(BufFile *file);
102	static int BufFileFlush(BufFile *file);
103	static File MakeNewSharedSegment(BufFile file, int* segment);
104
105	/*
106	* Create BufFile and perform the common initialization.
107	*/
108	static BufFile *
109	makeBufFileCommon(int nfiles)
110	{
111	BufFile file = (BufFile ) palloc(sizeof(BufFile));
112
113	file->numFiles = nfiles;
114	file->isInterXact = false;
115	file->dirty = false;
116	file->resowner = CurrentResourceOwner;
117	file->curFile = `0`;
118	file->curOffset = `0L`;
119	file->pos = `0`;
120	file->nbytes = `0`;
121
122	return file;
123	}
124
125	/*
126	* Create a BufFile given the first underlying physical file.
127	* NOTE: caller must set isInterXact if appropriate.
128	*/
129	static BufFile *
130	makeBufFile(File firstfile)
131	{
132	BufFile *file = makeBufFileCommon(`1`);
133
134	file->files = (File ) palloc(sizeof*(File));
135	file->files[`0`] = firstfile;
136	file->readOnly = false;
137	file->fileset = NULL;
138	file->name = NULL;
139
140	return file;
141	}
142
143	/*
144	* Add another component temp file.
145	*/
146	static void
147	extendBufFile(BufFile *file)
148	{
149	File pfile;
150	ResourceOwner oldowner;
151
152	/ Be sure to associate the file with the BufFile's resource owner /
153	oldowner = CurrentResourceOwner;
154	CurrentResourceOwner = file->resowner;
155
156	if (file->fileset == NULL)
157	pfile = OpenTemporaryFile(file->isInterXact);
158	else
159	pfile = MakeNewSharedSegment(file, file->numFiles);
160
161	Assert(pfile >= `0`);
162
163	CurrentResourceOwner = oldowner;
164
165	file->files = (File *) repalloc(file->files,
166	(file->numFiles + `1`) * sizeof(File));
167	file->files[file->numFiles] = pfile;
168	file->numFiles++;
169	}
170
171	/*
172	* Create a BufFile for a new temporary file (which will expand to become
173	* multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
174	* written to it).
175	*
176	* If interXact is true, the temp file will not be automatically deleted
177	* at end of transaction.
178	*
179	* Note: if interXact is true, the caller had better be calling us in a
180	* memory context, and with a resource owner, that will survive across
181	* transaction boundaries.
182	*/
183	BufFile *
184	BufFileCreateTemp(bool interXact)
185	{
186	BufFile *file;
187	File pfile;
188
189	/*
190	* Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
191	* Possibly the caller will have done this already, but it seems useful to
192	* double-check here. Failure to do this at all would result in the temp
193	* files always getting placed in the default tablespace, which is a
194	* pretty hard-to-detect bug. Callers may prefer to do it earlier if they
195	* want to be sure that any required catalog access is done in some other
196	* resource context.
197	*/
198	PrepareTempTablespaces();
199
200	pfile = OpenTemporaryFile(interXact);
201	Assert(pfile >= `0`);
202
203	file = makeBufFile(pfile);
204	file->isInterXact = interXact;
205
206	return file;
207	}
208
209	/*
210	* Build the name for a given segment of a given BufFile.
211	*/
212	static void
213	SharedSegmentName(char name, const* char buffile_name, int* segment)
214	{
215	snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
216	}
217
218	/*
219	* Create a new segment file backing a shared BufFile.
220	*/
221	static File
222	MakeNewSharedSegment(BufFile buffile, int* segment)
223	{
224	char name[MAXPGPATH];
225	File file;
226
227	/*
228	* It is possible that there are files left over from before a crash
229	* restart with the same name. In order for BufFileOpenShared() not to
230	* get confused about how many segments there are, we'll unlink the next
231	* segment number if it already exists.
232	*/
233	SharedSegmentName(name, buffile->name, segment + `1`);
234	SharedFileSetDelete(buffile->fileset, name, true);
235
236	/ Create the new segment. /
237	SharedSegmentName(name, buffile->name, segment);
238	file = SharedFileSetCreate(buffile->fileset, name);
239
240	/ SharedFileSetCreate would've errored out /
241	Assert(file > `0`);
242
243	return file;
244	}
245
246	/*
247	* Create a BufFile that can be discovered and opened read-only by other
248	* backends that are attached to the same SharedFileSet using the same name.
249	*
250	* The naming scheme for shared BufFiles is left up to the calling code. The
251	* name will appear as part of one or more filenames on disk, and might
252	* provide clues to administrators about which subsystem is generating
253	* temporary file data. Since each SharedFileSet object is backed by one or
254	* more uniquely named temporary directory, names don't conflict with
255	* unrelated SharedFileSet objects.
256	*/
257	BufFile *
258	BufFileCreateShared(SharedFileSet fileset, const* char *name)
259	{
260	BufFile *file;
261
262	file = makeBufFileCommon(`1`);
263	file->fileset = fileset;
264	file->name = pstrdup(name);
265	file->files = (File ) palloc(sizeof*(File));
266	file->files[`0`] = MakeNewSharedSegment(file, `0`);
267	file->readOnly = false;
268
269	return file;
270	}
271
272	/*
273	* Open a file that was previously created in another backend (or this one)
274	* with BufFileCreateShared in the same SharedFileSet using the same name.
275	* The backend that created the file must have called BufFileClose() or
276	* BufFileExportShared() to make sure that it is ready to be opened by other
277	* backends and render it read-only.
278	*/
279	BufFile *
280	BufFileOpenShared(SharedFileSet fileset, const* char *name)
281	{
282	BufFile *file;
283	char segment_name[MAXPGPATH];
284	Size capacity = `16`;
285	File *files;
286	int nfiles = `0`;
287
288	files = palloc(sizeof(File) * capacity);
289
290	/*
291	* We don't know how many segments there are, so we'll probe the
292	* filesystem to find out.
293	*/
294	for (;;)
295	{
296	/ See if we need to expand our file segment array. /
297	if (nfiles + `1` > capacity)
298	{
299	capacity *= `2`;
300	files = repalloc(files, sizeof(File) * capacity);
301	}
302	/ Try to load a segment. /
303	SharedSegmentName(segment_name, name, nfiles);
304	files[nfiles] = SharedFileSetOpen(fileset, segment_name);
305	if (files[nfiles] <= `0`)
306	break;
307	++nfiles;
308
309	CHECK_FOR_INTERRUPTS();
310	}
311
312	/*
313	* If we didn't find any files at all, then no BufFile exists with this
314	* name.
315	*/
316	if (nfiles == `0`)
317	ereport(ERROR,
318	(errcode_for_file_access(),
319	errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
320	segment_name, name)));
321
322	file = makeBufFileCommon(nfiles);
323	file->files = files;
324	file->readOnly = true; / Can't write to files opened this way /
325	file->fileset = fileset;
326	file->name = pstrdup(name);
327
328	return file;
329	}
330
331	/*
332	* Delete a BufFile that was created by BufFileCreateShared in the given
333	* SharedFileSet using the given name.
334	*
335	* It is not necessary to delete files explicitly with this function. It is
336	* provided only as a way to delete files proactively, rather than waiting for
337	* the SharedFileSet to be cleaned up.
338	*
339	* Only one backend should attempt to delete a given name, and should know
340	* that it exists and has been exported or closed.
341	*/
342	void
343	BufFileDeleteShared(SharedFileSet fileset, const* char *name)
344	{
345	char segment_name[MAXPGPATH];
346	int segment = `0`;
347	bool found = false;
348
349	/*
350	* We don't know how many segments the file has. We'll keep deleting
351	* until we run out. If we don't manage to find even an initial segment,
352	* raise an error.
353	*/
354	for (;;)
355	{
356	SharedSegmentName(segment_name, name, segment);
357	if (!SharedFileSetDelete(fileset, segment_name, true))
358	break;
359	found = true;
360	++segment;
361
362	CHECK_FOR_INTERRUPTS();
363	}
364
365	if (!found)
366	elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
367	}
368
369	/*
370	* BufFileExportShared --- flush and make read-only, in preparation for sharing.
371	*/
372	void
373	BufFileExportShared(BufFile *file)
374	{
375	/ Must be a file belonging to a SharedFileSet. /
376	Assert(file->fileset != NULL);
377
378	/ It's probably a bug if someone calls this twice. /
379	Assert(!file->readOnly);
380
381	BufFileFlush(file);
382	file->readOnly = true;
383	}
384
385	/*
386	* Close a BufFile
387	*
388	* Like fclose(), this also implicitly FileCloses the underlying File.
389	*/
390	void
391	BufFileClose(BufFile *file)
392	{
393	int i;
394
395	/ flush any unwritten data /
396	BufFileFlush(file);
397	/ close and delete the underlying file(s) /
398	for (i = `0`; i < file->numFiles; i++)
399	FileClose(file->files[i]);
400	/ release the buffer space /
401	pfree(file->files);
402	pfree(file);
403	}
404
405	/*
406	* BufFileLoadBuffer
407	*
408	* Load some data into buffer, if possible, starting from curOffset.
409	* At call, must have dirty = false, pos and nbytes = 0.
410	* On exit, nbytes is number of bytes loaded.
411	*/
412	static void
413	BufFileLoadBuffer(BufFile *file)
414	{
415	File thisfile;
416
417	/*
418	* Advance to next component file if necessary and possible.
419	*/
420	if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
421	file->curFile + `1` < file->numFiles)
422	{
423	file->curFile++;
424	file->curOffset = `0L`;
425	}
426
427	/*
428	* Read whatever we can get, up to a full bufferload.
429	*/
430	thisfile = file->files[file->curFile];
431	file->nbytes = FileRead(thisfile,
432	file->buffer.data,
433	sizeof(file->buffer),
434	file->curOffset,
435	WAIT_EVENT_BUFFILE_READ);
436	if (file->nbytes < `0`)
437	file->nbytes = `0`;
438	/ we choose not to advance curOffset here /
439
440	if (file->nbytes > `0`)
441	pgBufferUsage.temp_blks_read++;
442	}
443
444	/*
445	* BufFileDumpBuffer
446	*
447	* Dump buffer contents starting at curOffset.
448	* At call, should have dirty = true, nbytes > 0.
449	* On exit, dirty is cleared if successful write, and curOffset is advanced.
450	*/
451	static void
452	BufFileDumpBuffer(BufFile *file)
453	{
454	int wpos = `0`;
455	int bytestowrite;
456	File thisfile;
457
458	/*
459	* Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
460	* crosses a component-file boundary; so we need a loop.
461	*/
462	while (wpos < file->nbytes)
463	{
464	off_t availbytes;
465
466	/*
467	* Advance to next component file if necessary and possible.
468	*/
469	if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
470	{
471	while (file->curFile + `1` >= file->numFiles)
472	extendBufFile(file);
473	file->curFile++;
474	file->curOffset = `0L`;
475	}
476
477	/*
478	* Determine how much we need to write into this file.
479	*/
480	bytestowrite = file->nbytes - wpos;
481	availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
482
483	if ((off_t) bytestowrite > availbytes)
484	bytestowrite = (int) availbytes;
485
486	thisfile = file->files[file->curFile];
487	bytestowrite = FileWrite(thisfile,
488	file->buffer.data + wpos,
489	bytestowrite,
490	file->curOffset,
491	WAIT_EVENT_BUFFILE_WRITE);
492	if (bytestowrite <= `0`)
493	return; / failed to write /
494	file->curOffset += bytestowrite;
495	wpos += bytestowrite;
496
497	pgBufferUsage.temp_blks_written++;
498	}
499	file->dirty = false;
500
501	/*
502	* At this point, curOffset has been advanced to the end of the buffer,
503	* ie, its original value + nbytes. We need to make it point to the
504	* logical file position, ie, original value + pos, in case that is less
505	* (as could happen due to a small backwards seek in a dirty buffer!)
506	*/
507	file->curOffset -= (file->nbytes - file->pos);
508	if (file->curOffset < `0`) / handle possible segment crossing /
509	{
510	file->curFile--;
511	Assert(file->curFile >= `0`);
512	file->curOffset += MAX_PHYSICAL_FILESIZE;
513	}
514
515	/*
516	* Now we can set the buffer empty without changing the logical position
517	*/
518	file->pos = `0`;
519	file->nbytes = `0`;
520	}
521
522	/*
523	* BufFileRead
524	*
525	* Like fread() except we assume 1-byte element size.
526	*/
527	size_t
528	BufFileRead(BufFile file, void* *ptr, size_t size)
529	{
530	size_t nread = `0`;
531	size_t nthistime;
532
533	if (file->dirty)
534	{
535	if (BufFileFlush(file) != `0`)
536	return `0`; / could not flush... /
537	Assert(!file->dirty);
538	}
539
540	while (size > `0`)
541	{
542	if (file->pos >= file->nbytes)
543	{
544	/ Try to load more data into buffer. /
545	file->curOffset += file->pos;
546	file->pos = `0`;
547	file->nbytes = `0`;
548	BufFileLoadBuffer(file);
549	if (file->nbytes <= `0`)
550	break; / no more data available /
551	}
552
553	nthistime = file->nbytes - file->pos;
554	if (nthistime > size)
555	nthistime = size;
556	Assert(nthistime > `0`);
557
558	memcpy(ptr, file->buffer.data + file->pos, nthistime);
559
560	file->pos += nthistime;
561	ptr = (void ) ((char* *) ptr + nthistime);
562	size -= nthistime;
563	nread += nthistime;
564	}
565
566	return nread;
567	}
568
569	/*
570	* BufFileWrite
571	*
572	* Like fwrite() except we assume 1-byte element size.
573	*/
574	size_t
575	BufFileWrite(BufFile file, void* *ptr, size_t size)
576	{
577	size_t nwritten = `0`;
578	size_t nthistime;
579
580	Assert(!file->readOnly);
581
582	while (size > `0`)
583	{
584	if (file->pos >= BLCKSZ)
585	{
586	/ Buffer full, dump it out /
587	if (file->dirty)
588	{
589	BufFileDumpBuffer(file);
590	if (file->dirty)
591	break; / I/O error /
592	}
593	else
594	{
595	/ Hmm, went directly from reading to writing? /
596	file->curOffset += file->pos;
597	file->pos = `0`;
598	file->nbytes = `0`;
599	}
600	}
601
602	nthistime = BLCKSZ - file->pos;
603	if (nthistime > size)
604	nthistime = size;
605	Assert(nthistime > `0`);
606
607	memcpy(file->buffer.data + file->pos, ptr, nthistime);
608
609	file->dirty = true;
610	file->pos += nthistime;
611	if (file->nbytes < file->pos)
612	file->nbytes = file->pos;
613	ptr = (void ) ((char* *) ptr + nthistime);
614	size -= nthistime;
615	nwritten += nthistime;
616	}
617
618	return nwritten;
619	}
620
621	/*
622	* BufFileFlush
623	*
624	* Like fflush()
625	*/
626	static int
627	BufFileFlush(BufFile *file)
628	{
629	if (file->dirty)
630	{
631	BufFileDumpBuffer(file);
632	if (file->dirty)
633	return EOF;
634	}
635
636	return `0`;
637	}
638
639	/*
640	* BufFileSeek
641	*
642	* Like fseek(), except that target position needs two values in order to
643	* work when logical filesize exceeds maximum value representable by off_t.
644	* We do not support relative seeks across more than that, however.
645	*
646	* Result is 0 if OK, EOF if not. Logical position is not moved if an
647	* impossible seek is attempted.
648	*/
649	int
650	BufFileSeek(BufFile file, int* fileno, off_t offset, int whence)
651	{
652	int newFile;
653	off_t newOffset;
654
655	switch (whence)
656	{
657	case SEEK_SET:
658	if (fileno < `0`)
659	return EOF;
660	newFile = fileno;
661	newOffset = offset;
662	break;
663	case SEEK_CUR:
664
665	/*
666	* Relative seek considers only the signed offset, ignoring
667	* fileno. Note that large offsets (> 1 gig) risk overflow in this
668	* add, unless we have 64-bit off_t.
669	*/
670	newFile = file->curFile;
671	newOffset = (file->curOffset + file->pos) + offset;
672	break;
673	#ifdef NOT_USED
674	case SEEK_END:
675	/ could be implemented, not needed currently /
676	break;
677	#endif
678	default:
679	elog(ERROR, "invalid whence: %d", whence);
680	return EOF;
681	}
682	while (newOffset < `0`)
683	{
684	if (--newFile < `0`)
685	return EOF;
686	newOffset += MAX_PHYSICAL_FILESIZE;
687	}
688	if (newFile == file->curFile &&
689	newOffset >= file->curOffset &&
690	newOffset <= file->curOffset + file->nbytes)
691	{
692	/*
693	* Seek is to a point within existing buffer; we can just adjust
694	* pos-within-buffer, without flushing buffer. Note this is OK
695	* whether reading or writing, but buffer remains dirty if we were
696	* writing.
697	*/
698	file->pos = (int) (newOffset - file->curOffset);
699	return `0`;
700	}
701	/ Otherwise, must reposition buffer, so flush any dirty data /
702	if (BufFileFlush(file) != `0`)
703	return EOF;
704
705	/*
706	* At this point and no sooner, check for seek past last segment. The
707	* above flush could have created a new segment, so checking sooner would
708	* not work (at least not with this code).
709	*/
710
711	/ convert seek to "start of next seg" to "end of last seg" /
712	if (newFile == file->numFiles && newOffset == `0`)
713	{
714	newFile--;
715	newOffset = MAX_PHYSICAL_FILESIZE;
716	}
717	while (newOffset > MAX_PHYSICAL_FILESIZE)
718	{
719	if (++newFile >= file->numFiles)
720	return EOF;
721	newOffset -= MAX_PHYSICAL_FILESIZE;
722	}
723	if (newFile >= file->numFiles)
724	return EOF;
725	/ Seek is OK! /
726	file->curFile = newFile;
727	file->curOffset = newOffset;
728	file->pos = `0`;
729	file->nbytes = `0`;
730	return `0`;
731	}
732
733	void
734	BufFileTell(BufFile file, int* fileno, off_t offset)
735	{
736	*fileno = file->curFile;
737	*offset = file->curOffset + file->pos;
738	}
739
740	/*
741	* BufFileSeekBlock --- block-oriented seek
742	*
743	* Performs absolute seek to the start of the n'th BLCKSZ-sized block of
744	* the file. Note that users of this interface will fail if their files
745	* exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
746	* with tables bigger than that, either...
747	*
748	* Result is 0 if OK, EOF if not. Logical position is not moved if an
749	* impossible seek is attempted.
750	*/
751	int
752	BufFileSeekBlock(BufFile file, long* blknum)
753	{
754	return BufFileSeek(file,
755	(int) (blknum / BUFFILE_SEG_SIZE),
756	(off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
757	SEEK_SET);
758	}
759
#ifdef NOT_USED
/*
 * BufFileTellBlock --- block-oriented tell
 *
 * Returns the number of the BLCKSZ-sized block containing the current
 * position.  Any fractional part of a block in the current seek position
 * is ignored.
 */
long
BufFileTellBlock(BufFile *file)
{
	long		blknum;

	/* Whole blocks before the current position within this segment ... */
	blknum = (file->curOffset + file->pos) / BLCKSZ;

	/*
	 * ... plus the blocks in all preceding segments.  Widen curFile before
	 * multiplying so that the product is computed in long rather than int,
	 * where it could overflow for files with very many segments.
	 */
	blknum += (long) file->curFile * BUFFILE_SEG_SIZE;
	return blknum;
}

#endif
777
778	/*
779	* Return the current shared BufFile size.
780	*
781	* Counts any holes left behind by BufFileAppend as part of the size.
782	* ereport()s on failure.
783	*/
784	int64
785	BufFileSize(BufFile *file)
786	{
787	int64 lastFileSize;
788
789	Assert(file->fileset != NULL);
790
791	/ Get the size of the last physical file. /
792	lastFileSize = FileSize(file->files[file->numFiles - `1`]);
793	if (lastFileSize < `0`)
794	ereport(ERROR,
795	(errcode_for_file_access(),
796	errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
797	FilePathName(file->files[file->numFiles - `1`]),
798	file->name)));
799
800	return ((file->numFiles - `1`) * (int64) MAX_PHYSICAL_FILESIZE) +
801	lastFileSize;
802	}
803
804	/*
805	* Append the contents of source file (managed within shared fileset) to
806	* end of target file (managed within same shared fileset).
807	*
808	* Note that operation subsumes ownership of underlying resources from
809	* "source". Caller should never call BufFileClose against source having
810	* called here first. Resource owners for source and target must match,
811	* too.
812	*
813	* This operation works by manipulating lists of segment files, so the
814	* file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
815	* boundary, typically creating empty holes before the boundary. These
816	* areas do not contain any interesting data, and cannot be read from by
817	* caller.
818	*
819	* Returns the block number within target where the contents of source
820	* begins. Caller should apply this as an offset when working off block
821	* positions that are in terms of the original BufFile space.
822	*/
823	long
824	BufFileAppend(BufFile target, BufFile source)
825	{
826	long startBlock = target->numFiles * BUFFILE_SEG_SIZE;
827	int newNumFiles = target->numFiles + source->numFiles;
828	int i;
829
830	Assert(target->fileset != NULL);
831	Assert(source->readOnly);
832	Assert(!source->dirty);
833	Assert(source->fileset != NULL);
834
835	if (target->resowner != source->resowner)
836	elog(ERROR, "could not append BufFile with non-matching resource owner");
837
838	target->files = (File *)
839	repalloc(target->files, sizeof(File) * newNumFiles);
840	for (i = target->numFiles; i < newNumFiles; i++)
841	target->files[i] = source->files[i - target->numFiles];
842	target->numFiles = newNumFiles;
843
844	return startBlock;
845	}
846

Browse the source code of PostgreSQL/src/backend/storage/file/buffile.c